From 797476b88bde2a6001f9552f383f147e58c1a330 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 18 Oct 2016 15:40:29 +0900 Subject: block: Add 'zoned' queue limit Add the zoned queue limit to indicate the zoning model of a block device. Defined values are 0 (BLK_ZONED_NONE) for regular block devices, 1 (BLK_ZONED_HA) for host-aware zone block devices and 2 (BLK_ZONED_HM) for host-managed zone block devices. The standards defined drive managed model is not defined here since these block devices do not provide any command for accessing zone information. Drive managed model devices will be reported as BLK_ZONED_NONE. The helper functions blk_queue_zoned_model and bdev_zoned_model return the zoned limit and the functions blk_queue_is_zoned and bdev_is_zoned return a boolean for callers to test if a block device is zoned. The zoned attribute is also exported as a string to applications via sysfs. BLK_ZONED_NONE shows as "none", BLK_ZONED_HA as "host-aware" and BLK_ZONED_HM as "host-managed". Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c47c358ba052..f19e16bb43d1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -261,6 +261,15 @@ struct blk_queue_tag { #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) +/* + * Zoned block device models (zoned limit). + */ +enum blk_zoned_model { + BLK_ZONED_NONE, /* Regular block device */ + BLK_ZONED_HA, /* Host-aware zoned block device */ + BLK_ZONED_HM, /* Host-managed zoned block device */ +}; + struct queue_limits { unsigned long bounce_pfn; unsigned long seg_boundary_mask; @@ -290,6 +299,7 @@ struct queue_limits { unsigned char cluster; unsigned char discard_zeroes_data; unsigned char raid_partial_stripes_expensive; + enum blk_zoned_model zoned; }; struct request_queue { @@ -627,6 +637,23 @@ static inline unsigned int blk_queue_cluster(struct request_queue *q) return q->limits.cluster; } +static inline enum blk_zoned_model +blk_queue_zoned_model(struct request_queue *q) +{ + return q->limits.zoned; +} + +static inline bool blk_queue_is_zoned(struct request_queue *q) +{ + switch (blk_queue_zoned_model(q)) { + case BLK_ZONED_HA: + case BLK_ZONED_HM: + return true; + default: + return false; + } +} + /* * We regard a request as sync, if either a read or a sync write */ @@ -1354,6 +1381,26 @@ static inline unsigned int bdev_write_same(struct block_device *bdev) return 0; } +static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_zoned_model(q); + + return BLK_ZONED_NONE; +} + +static inline bool bdev_is_zoned(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_is_zoned(q); + + return false; +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3 From 6a0cb1bc106fc07ce0443303bcdb7f7da5131e5c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 18 Oct 2016 15:40:33 +0900 Subject: block: Implement support for zoned block devices Implement zoned block device zone information reporting and reset. Zone information are reported as struct blk_zone. This implementation does not differentiate between host-aware and host-managed device models and is valid for both. Two functions are provided: blkdev_report_zones for discovering the zone configuration of a zoned block device, and blkdev_reset_zones for resetting the write pointer of sequential zones. The helper function blk_queue_zone_size and bdev_zone_size are also provided for, as the name suggest, obtaining the zone size (in 512B sectors) of the zones of the device. Signed-off-by: Hannes Reinecke [Damien: * Removed the zone cache * Implement report zones operation based on earlier proposal by Shaun Tancheff ] Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f19e16bb43d1..252043f7cd2c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -302,6 +303,21 @@ struct queue_limits { enum blk_zoned_model zoned; }; +#ifdef CONFIG_BLK_DEV_ZONED + +struct blk_zone_report_hdr { + unsigned int nr_zones; + u8 padding[60]; +}; + +extern int blkdev_report_zones(struct block_device *bdev, + sector_t sector, struct blk_zone *zones, + unsigned int *nr_zones, gfp_t gfp_mask); +extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, + sector_t nr_sectors, gfp_t gfp_mask); + +#endif /* CONFIG_BLK_DEV_ZONED */ + struct request_queue { /* * Together with queue_head for cacheline sharing @@ -654,6 +670,11 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } } +static inline unsigned int blk_queue_zone_size(struct request_queue *q) +{ + return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; +} + /* * We regard a request as sync, if either a read or a sync write */ @@ -1401,6 +1422,16 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } +static inline unsigned int bdev_zone_size(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_zone_size(q); + + return 0; +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3 From 3ed05a987e0f63b21e634101e0b460d32f3581c3 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 18 Oct 2016 15:40:35 +0900 Subject: blk-zoned: implement ioctls Adds the new BLKREPORTZONE and BLKRESETZONE ioctls for respectively obtaining the zone configuration of a zoned block device and resetting the write pointer of sequential zones of a zoned block device. The BLKREPORTZONE ioctl maps directly to a single call of the function blkdev_report_zones. The zone information result is passed as an array of struct blk_zone identical to the structure used internally for processing the REQ_OP_ZONE_REPORT operation. The BLKRESETZONE ioctl maps to a single call of the blkdev_reset_zones function. Signed-off-by: Shaun Tancheff Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 252043f7cd2c..90097dd8b8ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -316,6 +316,27 @@ extern int blkdev_report_zones(struct block_device *bdev, extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); +extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); + +#else /* CONFIG_BLK_DEV_ZONED */ + +static inline int blkdev_report_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + +static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + #endif /* CONFIG_BLK_DEV_ZONED */ struct request_queue { -- cgit v1.2.3 From 5dc8b362a2374d007bc0db649b7ab6a79dd32bda Mon Sep 17 00:00:00 2001 From: Adam Manzanares Date: Mon, 17 Oct 2016 11:27:28 -0700 Subject: block: Add iocontext priority to request Patch adds an association between iocontext ioprio and the ioprio of a request. This is done to enable request based drivers the ability to act on priority information stored in the request. An example being ATA devices that support command priorities. If the ATA driver discovers that the device supports command priorities and the request has valid priority information indicating the request is high priority, then a high priority command can be sent to the device. This should improve tail latencies for high priority IO on any device that queues requests internally and can make use of the priority information stored in the request. The ioprio of the request is set in blk_rq_set_prio which takes the request and the ioc as arguments. If the ioc is valid in blk_rq_set_prio then the iopriority of the request is set as the iopriority of the ioc. In init_request_from_bio a check is made to see if the ioprio of the bio is valid and if so then the request prio comes from the bio. Signed-off-by: Adam Manzananares Reviewed-by: Jens Axboe Signed-off-by: Tejun Heo --- include/linux/blkdev.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c47c358ba052..9a0ceaa1b7e6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -933,6 +933,20 @@ static inline unsigned int blk_rq_count_bios(struct request *rq) return nr_bios; } +/* + * blk_rq_set_prio - associate a request with prio from ioc + * @rq: request of interest + * @ioc: target iocontext + * + * Assocate request prio with ioc prio so request based drivers + * can leverage priority information. + */ +static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc) +{ + if (ioc) + rq->ioprio = ioc->ioprio; +} + /* * Request issue related functions. */ -- cgit v1.2.3 From e806402130c9c494e22c73ae9ead4e79d2a5811c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:13 +0200 Subject: block: split out request-only flags into a new namespace A lot of the REQ_* flags are only used on struct requests, and only of use to the block layer and a few drivers that dig into struct request internals. This patch adds a new req_flags_t rq_flags field to struct request for them, and thus dramatically shrinks the number of common requests. It also removes the unfortunate situation where we have to fit the fields from the same enum into 32 bits for struct bio and 64 bits for struct request. Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 90097dd8b8ed..b4415feac679 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -78,6 +78,50 @@ enum rq_cmd_type_bits { REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; +/* + * request flags */ +typedef __u32 __bitwise req_flags_t; + +/* elevator knows about this request */ +#define RQF_SORTED ((__force req_flags_t)(1 << 0)) +/* drive already may have started this one */ +#define RQF_STARTED ((__force req_flags_t)(1 << 1)) +/* uses tagged queueing */ +#define RQF_QUEUED ((__force req_flags_t)(1 << 2)) +/* may not be passed by ioscheduler */ +#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) +/* request for flush sequence */ +#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) +/* merge of different types, fail separately */ +#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) +/* track inflight for MQ */ +#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) +/* don't call prep for this one */ +#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) +/* set for "ide_preempt" requests and also for requests for which the SCSI + "quiesce" state must be ignored. */ +#define RQF_PREEMPT ((__force req_flags_t)(1 << 8)) +/* contains copies of user pages */ +#define RQF_COPY_USER ((__force req_flags_t)(1 << 9)) +/* vaguely specified driver internal error. Ignored by the block layer */ +#define RQF_FAILED ((__force req_flags_t)(1 << 10)) +/* don't warn about errors */ +#define RQF_QUIET ((__force req_flags_t)(1 << 11)) +/* elevator private data attached */ +#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) +/* account I/O stat */ +#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) +/* request came from our alloc pool */ +#define RQF_ALLOCED ((__force req_flags_t)(1 << 14)) +/* runtime pm request */ +#define RQF_PM ((__force req_flags_t)(1 << 15)) +/* on IO scheduler merge hash */ +#define RQF_HASHED ((__force req_flags_t)(1 << 16)) + +/* flags that prevent us from merging requests: */ +#define RQF_NOMERGE_FLAGS \ + (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ) + #define BLK_MAX_CDB 16 /* @@ -99,6 +143,7 @@ struct request { int cpu; unsigned cmd_type; u64 cmd_flags; + req_flags_t rq_flags; unsigned long atomic_flags; /* the following two fields are internal, NEVER access directly */ @@ -648,7 +693,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) REQ_FAILFAST_DRIVER)) #define blk_account_rq(rq) \ - (((rq)->cmd_flags & REQ_STARTED) && \ + (((rq)->rq_flags & RQF_STARTED) && \ ((rq)->cmd_type == REQ_TYPE_FS)) #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) @@ -740,6 +785,8 @@ static inline bool rq_mergeable(struct request *rq) if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; + if (rq->rq_flags & RQF_NOMERGE_FLAGS) + return false; return true; } -- cgit v1.2.3 From ef295ecf090d3e86e5b742fc6ab34f1122a43773 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2016 08:48:16 -0600 Subject: block: better op and flags encoding Now that we don't need the common flags to overflow outside the range of a 32-bit type we can encode them the same way for both the bio and request fields. This in addition allows us to place the operation first (and make some room for more ops while we're at it) and to stop having to shift around the operation values. In addition this allows passing around only one value in the block layer instead of two (and eventuall also in the file systems, but we can do that later) and thus clean up a lot of code. Last but not least this allows decreasing the size of the cmd_flags field in struct request to 32-bits. Various functions passing this value could also be updated, but I'd like to avoid the churn for now. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b4415feac679..8396da2bb698 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -142,7 +142,7 @@ struct request { int cpu; unsigned cmd_type; - u64 cmd_flags; + unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; unsigned long atomic_flags; @@ -244,20 +244,6 @@ struct request { struct request *next_rq; }; -#define REQ_OP_SHIFT (8 * sizeof(u64) - REQ_OP_BITS) -#define req_op(req) ((req)->cmd_flags >> REQ_OP_SHIFT) - -#define req_set_op(req, op) do { \ - WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (req)->cmd_flags &= ((1ULL << REQ_OP_SHIFT) - 1); \ - (req)->cmd_flags |= ((u64) (op) << REQ_OP_SHIFT); \ -} while (0) - -#define req_set_op_attrs(req, op, flags) do { \ - req_set_op(req, op); \ - (req)->cmd_flags |= flags; \ -} while (0) - static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; @@ -741,17 +727,9 @@ static inline unsigned int blk_queue_zone_size(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } -/* - * We regard a request as sync, if either a read or a sync write - */ -static inline bool rw_is_sync(int op, unsigned int rw_flags) -{ - return op == REQ_OP_READ || (rw_flags & REQ_SYNC); -} - static inline bool rq_is_sync(struct request *rq) { - return rw_is_sync(req_op(rq), rq->cmd_flags); + return op_is_sync(rq->cmd_flags); } static inline bool blk_rl_full(struct request_list *rl, bool sync) -- cgit v1.2.3 From 6a83e74d214a47a1371cd2e6a783264fcba7d428 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 2 Nov 2016 10:09:51 -0600 Subject: blk-mq: Introduce blk_mq_quiesce_queue() blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations have finished. This function does *not* wait until all outstanding requests have finished (this means invocation of request.end_io()). The algorithm used by blk_mq_quiesce_queue() is as follows: * Hold either an RCU read lock or an SRCU read lock around .queue_rq() calls. The former is used if .queue_rq() does not block and the latter if .queue_rq() may block. * blk_mq_quiesce_queue() first calls blk_mq_stop_hw_queues() followed by synchronize_srcu() or synchronize_rcu(). The latter call waits for .queue_rq() invocations that started before blk_mq_quiesce_queue() was called. * The blk_mq_hctx_stopped() calls that control whether or not .queue_rq() will be called are called with the (S)RCU read lock held. This is necessary to avoid race conditions against blk_mq_quiesce_queue(). Signed-off-by: Bart Van Assche Cc: Hannes Reinecke Cc: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8396da2bb698..13d893a69b46 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -918,6 +918,7 @@ extern void __blk_run_queue(struct request_queue *q); extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); +extern void blk_mq_quiesce_queue(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); -- cgit v1.2.3 From 50d24c34403c62ad29e8b6db559d491bae20b4b7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 3 Nov 2016 17:03:53 -0700 Subject: block: immediately dispatch big size request Currently block plug holds up to 16 non-mergeable requests. This makes sense if the request size is small, eg, reduce lock contention. But if request size is big enough, we don't need to worry about lock contention. Holding such request makes no sense and it lows the disk utilization. In practice, this improves 10% throughput for my raid5 sequential write workload. The size (128k) is arbitrary right now, but it makes sure lock contention is small. This probably could be more intelligent, eg, check average request size holded. Since this is mainly for sequential IO, probably not worthy. V2: check the last request instead of the first request, so as long as there is one big size request we flush the plug. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13d893a69b46..9189a2d5c392 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1173,6 +1173,7 @@ struct blk_plug { struct list_head cb_list; /* md requires an unplug callback */ }; #define BLK_MAX_REQUEST_COUNT 16 +#define BLK_PLUG_FLUSH_SIZE (128 * 1024) struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); -- cgit v1.2.3 From d278d4a8892f13b6a9eb6102b356402f0e062324 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:21:08 -0600 Subject: block: add code to track actual device queue depth For blk-mq, ->nr_requests does track queue depth, at least at init time. But for the older queue paths, it's simply a soft setting. On top of that, it's generally larger than the hardware setting on purpose, to allow backup of requests for merging. Fill a hole in struct request with a 'queue_depth' member, that drivers can call to more closely inform the block layer of the real queue depth. Signed-off-by: Jens Axboe Reviewed-by: Jan Kara --- include/linux/blkdev.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9189a2d5c392..d364be6e6959 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -405,6 +405,8 @@ struct request_queue { struct blk_mq_ctx __percpu *queue_ctx; unsigned int nr_queues; + unsigned int queue_depth; + /* hw dispatch queues */ struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; @@ -777,6 +779,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) return false; } +static inline unsigned int blk_queue_depth(struct request_queue *q) +{ + if (q->queue_depth) + return q->queue_depth; + + return q->nr_requests; +} + /* * q->prep_rq_fn return values */ @@ -1094,6 +1104,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, -- cgit v1.2.3 From cf43e6be865a582ba66ee4747ae27a0513f6bba1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Nov 2016 21:32:37 -0700 Subject: block: add scalable completion tracking of requests For legacy block, we simply track them in the request queue. For blk-mq, we track them on a per-sw queue basis, which we can then sum up through the hardware queues and finally to a per device state. The stats are tracked in, roughly, 0.1s interval windows. Add sysfs files to display the stats. The feature is off by default, to avoid any extra overhead. In-kernel users of it can turn it on by setting QUEUE_FLAG_STATS in the queue flags. We currently don't turn it on if someone just reads any of the stats files, that is something we could add as well. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d364be6e6959..303723a2e5b8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -117,6 +117,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ #define RQF_HASHED ((__force req_flags_t)(1 << 16)) +/* IO stats tracking on */ +#define RQF_STATS ((__force req_flags_t)(1 << 17)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -197,6 +199,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; unsigned long start_time; + struct blk_issue_stat issue_stat; #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; @@ -492,6 +495,9 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; + + struct blk_rq_stat rq_stats[2]; + /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -585,6 +591,7 @@ struct request_queue { #define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ +#define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From 87760e5eef359788047d6fd54fc12eec74ce0d27 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Nov 2016 12:38:14 -0700 Subject: block: hook up writeback throttling Enable throttling of buffered writeback to make it a lot more smooth, and has way less impact on other system activity. Background writeback should be, by definition, background activity. The fact that we flush huge bundles of it at the time means that it potentially has heavy impacts on foreground workloads, which isn't ideal. We can't easily limit the sizes of writes that we do, since that would impact file system layout in the presence of delayed allocation. So just throttle back buffered writeback, unless someone is waiting for it. The algorithm for when to throttle takes its inspiration in the CoDel networking scheduling algorithm. Like CoDel, blk-wb monitors the minimum latencies of requests over a window of time. In that window of time, if the minimum latency of any request exceeds a given target, then a scale count is incremented and the queue depth is shrunk. The next monitoring window is shrunk accordingly. Unlike CoDel, if we hit a window that exhibits good behavior, then we simply increment the scale count and re-calculate the limits for that scale value. This prevents us from oscillating between a close-to-ideal value and max all the time, instead remaining in the windows where we get good behavior. Unlike CoDel, blk-wb allows the scale count to to negative. This happens if we primarily have writes going on. Unlike positive scale counts, this doesn't change the size of the monitoring window. When the heavy writers finish, blk-bw quickly snaps back to it's stable state of a zero scale count. The patch registers a sysfs entry, 'wb_lat_usec'. This sets the latency target to me met. It defaults to 2 msec for non-rotational storage, and 75 msec for rotational storage. Setting this value to '0' disables blk-wb. Generally, a user would not have to touch this setting. We don't enable WBT on devices that are managed with CFQ, and have a non-root block cgroup attached. If we have a proportional share setup on this particular disk, then the wbt throttling will interfere with that. We don't have a strong need for wbt for that case, since we will rely on CFQ doing that for us. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 303723a2e5b8..15da9e430f90 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -38,6 +38,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; +struct rq_wb; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -383,6 +384,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct rq_wb *rq_wb; + /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg * is used, root blkg allocates from @q->root_rl and all other -- cgit v1.2.3 From bbd7bb7017d5c2b1e75f3818b4ce88fa58bb0eab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Nov 2016 09:34:34 -0600 Subject: block: move poll code to blk-mq The poll code is blk-mq specific, let's move it to blk-mq.c. This is a prep patch for improving the polling code. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 15da9e430f90..bab18ee5810d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -952,7 +952,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -bool blk_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { -- cgit v1.2.3 From 06426adf072bca62ac31ea396ff2159a34f276c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Nov 2016 13:01:59 -0700 Subject: blk-mq: implement hybrid poll mode for sync O_DIRECT This patch enables a hybrid polling mode. Instead of polling after IO submission, we can induce an artificial delay, and then poll after that. For example, if the IO is presumed to complete in 8 usecs from now, we can sleep for 4 usecs, wake up, and then do our polling. This still puts a sleep/wakeup cycle in the IO path, but instead of the wakeup happening after the IO has completed, it'll happen before. With this hybrid scheme, we can achieve big latency reductions while still using the same (or less) amount of CPU. Signed-off-by: Jens Axboe Tested-By: Stephen Bates Reviewed-By: Stephen Bates --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bab18ee5810d..37ed4ea705c8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -509,6 +509,7 @@ struct request_queue { unsigned int request_fn_active; unsigned int rq_timeout; + unsigned int poll_nsec; struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; -- cgit v1.2.3 From 64f1c21e86f7fe63337b5c23c129de3ec506431d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Nov 2016 13:03:03 -0700 Subject: blk-mq: make the polling code adaptive The previous commit introduced the hybrid sleep/poll mode. Take that one step further, and use the completion latencies to automatically sleep for half the mean completion time. This is a good approximation. This changes the 'io_poll_delay' sysfs file a bit to expose the various options. Depending on the value, the polling code will behave differently: -1 Never enter hybrid sleep mode 0 Use half of the completion mean for the sleep delay >0 Use this specific value as the sleep delay Signed-off-by: Jens Axboe Tested-By: Stephen Bates Reviewed-By: Stephen Bates --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 37ed4ea705c8..85699bc90a51 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -509,7 +509,7 @@ struct request_queue { unsigned int request_fn_active; unsigned int rq_timeout; - unsigned int poll_nsec; + int poll_nsec; struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; -- cgit v1.2.3 From 9a05e7541c39680d28ecf91892338e074738d5fd Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 18 Nov 2016 15:16:06 +0100 Subject: block: Change extern inline to static inline With compilers which follow the C99 standard (like modern versions of gcc and clang), "extern inline" does the opposite thing from older versions of gcc (emits code for an externally linkable version of the inline function). "static inline" does the intended behavior in all cases instead. Description taken from commit 6d91857d4826 ("staging, rtl8192e, LLVMLinux: Change extern inline to static inline"). This also fixes the following GCC warning when building with CONFIG_PM disabled: ./include/linux/blkdev.h:1143:20: warning: no previous prototype for 'blk_set_runtime_active' [-Wmissing-prototypes] Fixes: d07ab6d11477 ("block: Add blk_set_runtime_active()") Reviewed-by: Mika Westerberg Signed-off-by: Tobias Klauser Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 85699bc90a51..541fdd8787a5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1174,7 +1174,7 @@ static inline int blk_pre_runtime_suspend(struct request_queue *q) static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {} static inline void blk_pre_runtime_resume(struct request_queue *q) {} static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} -extern inline void blk_set_runtime_active(struct request_queue *q) {} +static inline void blk_set_runtime_active(struct request_queue *q) {} #endif /* -- cgit v1.2.3 From e73c23ff736e1ea371dfa419d7bf8e77ee53044a Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:28:58 -0800 Subject: block: add async variant of blkdev_issue_zeroout Similar to __blkdev_issue_discard this variant allows submitting the final bio asynchronously and chaining multiple ranges into a single completion. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 541fdd8787a5..7e9d8a0895be 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1269,6 +1269,9 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct bio **biop); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); +extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, + bool discard); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard); static inline int sb_issue_discard(struct super_block *sb, sector_t block, -- cgit v1.2.3 From a6f0788ec2881ac14e97ff7fa6a78a807f87b5ba Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:28:59 -0800 Subject: block: add support for REQ_OP_WRITE_ZEROES This adds a new block layer operation to zero out a range of LBAs. This allows to implement zeroing for devices that don't use either discard with a predictable zero pattern or WRITE SAME of zeroes. The prominent example of that is NVMe with the Write Zeroes command, but in the future, this should also help with improving the way zeroing discards work. For this operation, suitable entry is exported in sysfs which indicate the number of maximum bytes allowed in one write zeroes operation by the device. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7e9d8a0895be..ebeef2b79c5a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -323,6 +323,7 @@ struct queue_limits { unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; + unsigned int max_write_zeroes_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -774,6 +775,9 @@ static inline bool rq_mergeable(struct request *rq) if (req_op(rq) == REQ_OP_FLUSH) return false; + if (req_op(rq) == REQ_OP_WRITE_ZEROES) + return false; + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) @@ -1004,6 +1008,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, if (unlikely(op == REQ_OP_WRITE_SAME)) return q->limits.max_write_same_sectors; + if (unlikely(op == REQ_OP_WRITE_ZEROES)) + return q->limits.max_write_zeroes_sectors; + return q->limits.max_sectors; } @@ -1107,6 +1114,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_same_sectors(struct request_queue *q, unsigned int max_write_same_sectors); +extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, @@ -1475,6 +1484,16 @@ static inline unsigned int bdev_write_same(struct block_device *bdev) return 0; } +static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return q->limits.max_write_zeroes_sectors; + + return 0; +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From f9d03f96b988002027d4b28ea1b7a24729a4c9b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Dec 2016 15:20:32 -0700 Subject: block: improve handling of the magic discard payload Instead of allocating a single unused biovec for discard requests, send them down without any payload. Instead we allow the driver to add a "special" payload using a biovec embedded into struct request (unioned over other fields never used while in the driver), and overloading the number of segments for this case. This has a couple of advantages: - we don't have to allocate the bio_vec - the amount of special casing for discard requests in the block layer is significantly reduced - using this same scheme for other request types is trivial, which will be important for implementing the new WRITE_ZEROES op on devices where it actually requires a payload (e.g. SCSI) - we can get rid of playing games with the request length, as we'll never touch it and completions will work just fine - it will allow us to support ranged discard operations in the future by merging non-contiguous discard bios into a single request - last but not least it removes a lot of code This patch is the common base for my WIP series for ranges discards and to remove discard_zeroes_data in favor of always using REQ_OP_WRITE_ZEROES, so it would be good to get it in quickly. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ebeef2b79c5a..c5393766909d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -120,10 +120,13 @@ typedef __u32 __bitwise req_flags_t; #define RQF_HASHED ((__force req_flags_t)(1 << 16)) /* IO stats tracking on */ #define RQF_STATS ((__force req_flags_t)(1 << 17)) +/* Look at ->special_vec for the actual data payload instead of the + bio chain. */ +#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ - (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ) + (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) #define BLK_MAX_CDB 16 @@ -175,6 +178,7 @@ struct request { */ union { struct rb_node rb_node; /* sort/lookup */ + struct bio_vec special_vec; void *completion_data; }; @@ -909,8 +913,6 @@ extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); -extern void blk_add_request_payload(struct request *rq, struct page *page, - int offset, unsigned int len); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, @@ -1153,6 +1155,13 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); +static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return 1; + return rq->nr_phys_segments; +} + extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); -- cgit v1.2.3 From e8465447d2f3366069115f7453153561ac9a1220 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Fri, 16 Dec 2016 10:11:56 +0530 Subject: block: Remove unused member (busy) from struct blk_queue_tag Signed-off-by: Ritesh Harjani Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 286b2a264383..83695641bd5e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -288,7 +288,6 @@ enum blk_queue_state { struct blk_queue_tag { struct request **tag_index; /* map of busy tags */ unsigned long *tag_map; /* bit map of free/busy tags */ - int busy; /* current depth */ int max_depth; /* what we will send to device */ int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ -- cgit v1.2.3 From f99e86485cc32cd16e5cc97f9bb0474f28608d84 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 12 Jan 2017 07:58:32 -0700 Subject: block: Rename blk_queue_zone_size and bdev_zone_size All block device data fields and functions returning a number of 512B sectors are by convention named xxx_sectors while names in the form xxx_size are generally used for a number of bytes. The blk_queue_zone_size and bdev_zone_size functions were not following this convention so rename them. No functional change is introduced by this patch. Signed-off-by: Damien Le Moal Collapsed the two patches, they were nonsensically split and broke bisection. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83695641bd5e..ff3d774f2751 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -739,7 +739,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } } -static inline unsigned int blk_queue_zone_size(struct request_queue *q) +static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) { return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } @@ -1536,12 +1536,12 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } -static inline unsigned int bdev_zone_size(struct block_device *bdev) +static inline unsigned int bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) - return blk_queue_zone_size(q); + return blk_queue_zone_sectors(q); return 0; } -- cgit v1.2.3 From 2e3258ecfaebace1ceffaa14e0ea94775d54f46f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Jan 2017 12:29:10 +0100 Subject: block: add blk_rq_payload_bytes Add a helper to calculate the actual data transfer size for special payload requests. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ff3d774f2751..1ca8e8fd1078 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1000,6 +1000,19 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +/* + * Some commands like WRITE SAME have a payload or data transfer size which + * is different from the size of the request. Any driver that supports such + * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to + * calculate the data transfer size. + */ +static inline unsigned int blk_rq_payload_bytes(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec.bv_len; + return blk_rq_bytes(rq); +} + static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { -- cgit v1.2.3