From 9efc160f4bbd69b17b48edec53067537d04e62b7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 31 May 2017 14:43:46 -0700 Subject: block: Introduce queue flag QUEUE_FLAG_SCSI_PASSTHROUGH From the context where a SCSI command is submitted it is not always possible to figure out whether or not the queue the command is submitted to has struct scsi_request as the first member of its private data. Hence introduce the flag QUEUE_FLAG_SCSI_PASSTHROUGH. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Cc: Omar Sandoval Cc: Don Brace Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ab92c4ea138b..019f18c65098 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -618,6 +618,7 @@ struct request_queue { #define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */ +#define QUEUE_FLAG_SCSI_PASSTHROUGH 30 /* queue supports SCSI commands */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -708,6 +709,8 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) +#define blk_queue_scsi_passthrough(q) \ + test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ -- cgit v1.2.3 From 4055351cdbb44e8646ff67b346c80097e1d2c04c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 09:37:58 +0200 Subject: fs: remove the unused error argument to dio_end_io() Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9b2654..4388ab58843d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2843,7 +2843,7 @@ enum { DIO_SKIP_DIO_COUNT = 0x08, }; -void dio_end_io(struct bio *bio, int error); +void dio_end_io(struct bio *bio); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, -- cgit v1.2.3 From 1be5690984588953e759af0a4c6ddac182a1806c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 09:38:03 +0200 Subject: dm: change ->end_io calling convention Turn the error paramter into a pointer so that target drivers can change the value, and make sure only DM_ENDIO_* values are returned from the methods. Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/device-mapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index f4c639c0c362..dec227acc13b 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -72,7 +72,7 @@ typedef void (*dm_release_clone_request_fn) (struct request *clone); * 2 : The target wants to push back the io */ typedef int (*dm_endio_fn) (struct dm_target *ti, - struct bio *bio, int error); + struct bio *bio, int *error); typedef int (*dm_request_endio_fn) (struct dm_target *ti, struct request *clone, int error, union map_info *map_context); -- cgit v1.2.3 From 2a842acab109f40f0d7d10b38e9ca88390628996 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 09:38:04 +0200 Subject: block: introduce new block status code type Currently we use nornal Linux errno values in the block layer, and while we accept any error a few have overloaded magic meanings. This patch instead introduces a new blk_status_t value that holds block layer specific status codes and explicitly explains their meaning. Helpers to convert from and to the previous special meanings are provided for now, but I suspect we want to get rid of them in the long run - those drivers that have a errno input (e.g. networking) usually get errnos that don't know about the special block layer overloads, and similarly returning them to userspace will usually return somethings that strictly speaking isn't correct for file system operations, but that's left as an exercise for later. For now the set of errors is a very limited set that closely corresponds to the previous overloaded errno values, but there is some low hanging fruite to improve it. blk_status_t (ab)uses the sparse __bitwise annotations to allow for sparse typechecking, so that we can easily catch places passing the wrong values. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++-- include/linux/blk_types.h | 16 ++++++++++++++++ include/linux/blkdev.h | 21 ++++++++++++--------- include/linux/device-mapper.h | 2 +- include/linux/ide.h | 6 +++--- 5 files changed, 34 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fcd641032f8d..0cf6735046d3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -230,8 +230,8 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) int blk_mq_request_started(struct request *rq); void blk_mq_start_request(struct request *rq); -void blk_mq_end_request(struct request *rq, int error); -void __blk_mq_end_request(struct request *rq, int error); +void blk_mq_end_request(struct request *rq, blk_status_t error); +void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 61339bc44400..59378939a8cd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,6 +17,22 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +/* + * Block error status values. See block/blk-core:blk_errors for the details. + */ +typedef u8 __bitwise blk_status_t; +#define BLK_STS_OK 0 +#define BLK_STS_NOTSUPP ((__force blk_status_t)1) +#define BLK_STS_TIMEOUT ((__force blk_status_t)2) +#define BLK_STS_NOSPC ((__force blk_status_t)3) +#define BLK_STS_TRANSPORT ((__force blk_status_t)4) +#define BLK_STS_TARGET ((__force blk_status_t)5) +#define BLK_STS_NEXUS ((__force blk_status_t)6) +#define BLK_STS_MEDIUM ((__force blk_status_t)7) +#define BLK_STS_PROTECTION ((__force blk_status_t)8) +#define BLK_STS_RESOURCE ((__force blk_status_t)9) +#define BLK_STS_IOERR ((__force blk_status_t)10) + struct blk_issue_stat { u64 stat; }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 019f18c65098..2a8871638453 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -55,7 +55,7 @@ struct blk_stat_callback; */ #define BLKCG_MAX_POLS 3 -typedef void (rq_end_io_fn)(struct request *, int); +typedef void (rq_end_io_fn)(struct request *, blk_status_t); #define BLK_RL_SYNCFULL (1U << 0) #define BLK_RL_ASYNCFULL (1U << 1) @@ -940,7 +940,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); extern void blk_rq_unprep_clone(struct request *rq); -extern int blk_insert_cloned_request(struct request_queue *q, +extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_delay_queue(struct request_queue *, unsigned long); @@ -980,6 +980,9 @@ extern void blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +int blk_status_to_errno(blk_status_t status); +blk_status_t errno_to_blk_status(int errno); + bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) @@ -1112,16 +1115,16 @@ extern struct request *blk_fetch_request(struct request_queue *q); * blk_end_request() for parts of the original function. * This prevents code duplication in drivers. */ -extern bool blk_update_request(struct request *rq, int error, +extern bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void blk_finish_request(struct request *rq, int error); -extern bool blk_end_request(struct request *rq, int error, +extern void blk_finish_request(struct request *rq, blk_status_t error); +extern bool blk_end_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void blk_end_request_all(struct request *rq, int error); -extern bool __blk_end_request(struct request *rq, int error, +extern void blk_end_request_all(struct request *rq, blk_status_t error); +extern bool __blk_end_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void __blk_end_request_all(struct request *rq, int error); -extern bool __blk_end_request_cur(struct request *rq, int error); +extern void __blk_end_request_all(struct request *rq, blk_status_t error); +extern bool __blk_end_request_cur(struct request *rq, blk_status_t error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index dec227acc13b..5de5c53251ec 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -74,7 +74,7 @@ typedef void (*dm_release_clone_request_fn) (struct request *clone); typedef int (*dm_endio_fn) (struct dm_target *ti, struct bio *bio, int *error); typedef int (*dm_request_endio_fn) (struct dm_target *ti, - struct request *clone, int error, + struct request *clone, blk_status_t error, union map_info *map_context); typedef void (*dm_presuspend_fn) (struct dm_target *ti); diff --git a/include/linux/ide.h b/include/linux/ide.h index 6980ca322074..dc152e4b7f73 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -671,7 +671,7 @@ struct ide_port_ops { void (*init_dev)(ide_drive_t *); void (*set_pio_mode)(struct hwif_s *, ide_drive_t *); void (*set_dma_mode)(struct hwif_s *, ide_drive_t *); - int (*reset_poll)(ide_drive_t *); + blk_status_t (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); void (*resetproc)(ide_drive_t *); void (*maskproc)(ide_drive_t *, int); @@ -1092,7 +1092,7 @@ int generic_ide_ioctl(ide_drive_t *, struct block_device *, unsigned, unsigned l extern int ide_vlb_clk; extern int ide_pci_clk; -int ide_end_rq(ide_drive_t *, struct request *, int, unsigned int); +int ide_end_rq(ide_drive_t *, struct request *, blk_status_t, unsigned int); void ide_kill_rq(ide_drive_t *, struct request *); void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); @@ -1123,7 +1123,7 @@ extern int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, int arg); void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8); -int ide_complete_rq(ide_drive_t *, int, unsigned int); +int ide_complete_rq(ide_drive_t *, blk_status_t, unsigned int); void ide_tf_readback(ide_drive_t *drive, struct ide_cmd *cmd); void ide_tf_dump(const char *, struct ide_cmd *); -- cgit v1.2.3 From fc17b6534eb8395f0b3133eb31d87deec32c642b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 09:38:05 +0200 Subject: blk-mq: switch ->queue_rq return value to blk_status_t Use the same values for use for request completion errors as the return value from ->queue_rq. BLK_STS_RESOURCE is special cased to cause a requeue, and all the others are completed as-is. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0cf6735046d3..b144b7b0e104 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -87,7 +87,8 @@ struct blk_mq_queue_data { bool last; }; -typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, + const struct blk_mq_queue_data *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -155,10 +156,6 @@ struct blk_mq_ops { }; enum { - BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */ - BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */ - BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */ - BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, -- cgit v1.2.3 From 4e4cbee93d56137ebff722be022cae5f70ef84fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 09:38:06 +0200 Subject: block: switch bios to blk_status_t Replace bi_error with a new bi_status to allow for a clear conversion. Note that device mapper overloaded bi_error with a private value, which we'll have to keep arround at least for now and thus propagate to a proper blk_status_t value. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- include/linux/blk_types.h | 5 ++++- include/linux/blkdev.h | 2 +- include/linux/device-mapper.h | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index d1b04b0e99cf..9455aada1399 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -414,7 +414,7 @@ extern void bio_endio(struct bio *); static inline void bio_io_error(struct bio *bio) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 59378939a8cd..dcd45b15a3a5 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -33,6 +33,9 @@ typedef u8 __bitwise blk_status_t; #define BLK_STS_RESOURCE ((__force blk_status_t)9) #define BLK_STS_IOERR ((__force blk_status_t)10) +/* hack for device mapper, don't use elsewhere: */ +#define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) + struct blk_issue_stat { u64 stat; }; @@ -44,7 +47,7 @@ struct blk_issue_stat { struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; - int bi_error; + blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2a8871638453..76b6df862a12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1782,7 +1782,7 @@ struct blk_integrity_iter { const char *disk_name; }; -typedef int (integrity_processing_fn) (struct blk_integrity_iter *); +typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); struct blk_integrity_profile { integrity_processing_fn *generate_fn; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 5de5c53251ec..456da5017b32 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -72,7 +72,7 @@ typedef void (*dm_release_clone_request_fn) (struct request *clone); * 2 : The target wants to push back the io */ typedef int (*dm_endio_fn) (struct dm_target *ti, - struct bio *bio, int *error); + struct bio *bio, blk_status_t *error); typedef int (*dm_request_endio_fn) (struct dm_target *ti, struct request *clone, blk_status_t error, union map_info *map_context); -- cgit v1.2.3 From 39673e1995381b09a63cc7e9d0aea7cf871cb359 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2017 15:36:28 +0100 Subject: nvme.h: add struct nvme_host_mem_buf_desc and HMB flags Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn --- include/linux/nvme.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e400a69fa1d3..180a2fdbcaef 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -587,6 +587,11 @@ struct nvme_feat_auto_pst { __le64 entries[32]; }; +enum { + NVME_HOST_MEM_ENABLE = (1 << 0), + NVME_HOST_MEM_RETURN = (1 << 1), +}; + /* Admin commands */ enum nvme_admin_opcode { @@ -671,6 +676,12 @@ struct nvme_features { __u32 rsvd12[4]; }; +struct nvme_host_mem_buf_desc { + __le64 addr; + __le32 size; + __u32 rsvd; +}; + struct nvme_create_cq { __u8 opcode; __u8 flags; -- cgit v1.2.3 From b85cf7348ab50e2042b732e19031b1d22eedc741 Mon Sep 17 00:00:00 2001 From: Arnav Dawn Date: Fri, 12 May 2017 17:12:03 +0200 Subject: nvme.h: add dword 12 - 15 fields to struct nvme_features Signed-off-by: Arnav Dawn [hch: split from a larger patch, new changelog] Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn --- include/linux/nvme.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 180a2fdbcaef..51ca4771be2c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -673,7 +673,10 @@ struct nvme_features { union nvme_data_ptr dptr; __le32 fid; __le32 dword11; - __u32 rsvd12[4]; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; }; struct nvme_host_mem_buf_desc { -- cgit v1.2.3 From 97f6ef6464dbd235a4d9bdfc05d949aab24fc927 Mon Sep 17 00:00:00 2001 From: Xu Yu Date: Wed, 24 May 2017 16:39:55 +0800 Subject: nvme-pci: remap BAR0 to cover admin CQ doorbell for large stride The existing driver initially maps 8192 bytes of BAR0 which is intended to cover doorbells of admin SQ and CQ. However, if a large stride, e.g. 10, is used, the doorbell of admin CQ will be out of 8192 bytes. Consequently, a page fault will be raised when the admin CQ doorbell is accessed in nvme_configure_admin_queue(). This patch fixes this issue by remapping BAR0 before accessing admin CQ doorbell if the initial mapping is not enough. Signed-off-by: Xu Yu Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 51ca4771be2c..706a0fbfe28e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -102,6 +102,7 @@ enum { NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ }; #define NVME_CAP_MQES(cap) ((cap) & 0xffff) -- cgit v1.2.3 From 0945e56994ac855d01c4aecf69bded65c751b894 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 7 Jun 2017 11:45:28 +0200 Subject: scatterlist: add sg_zero_buffer() helper The sg_zero_buffer() helper is used to zero fill an area in a SG list. Signed-off-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg [hch: renamed to sg_zero_buffer] Signed-off-by: Christoph Hellwig --- include/linux/scatterlist.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index cb3c8fe6acd7..4b3286ac60c8 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -278,6 +278,8 @@ size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); +size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, + size_t buflen, off_t skip); /* * Maximum number of entries that will be allocated in one piece, if -- cgit v1.2.3 From 0add5e8e588c65c5ac6a3255f624260bf889128d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 7 Jun 2017 11:45:29 +0200 Subject: nvmet: use NVME_IDENTIFY_DATA_SIZE Use NVME_IDENTIFY_DATA_SIZE define instead of hard coding the magic 4096 value. Signed-off-by: Johannes Thumshirn Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke [hch: converted three more users] Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 706a0fbfe28e..782d557c5535 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -665,6 +665,8 @@ struct nvme_identify { __u32 rsvd11[5]; }; +#define NVME_IDENTIFY_DATA_SIZE 4096 + struct nvme_features { __u8 opcode; __u8 flags; -- cgit v1.2.3 From af8b86e9a7ffb9528e745b7ea25b18545699482c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 7 Jun 2017 11:45:30 +0200 Subject: nvme: introduce NVMe Namespace Identification Descriptor structures Signed-off-by: Johannes Thumshirn Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 782d557c5535..f2344aa923e8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -290,6 +290,7 @@ enum { NVME_ID_CNS_NS = 0x00, NVME_ID_CNS_CTRL = 0x01, NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESC_LIST = 0x03, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -316,6 +317,22 @@ enum { NVME_NS_DPS_PI_TYPE3 = 3, }; +struct nvme_ns_id_desc { + __u8 nidt; + __u8 nidl; + __le16 reserved; +}; + +#define NVME_NIDT_EUI64_LEN 8 +#define NVME_NIDT_NGUID_LEN 16 +#define NVME_NIDT_UUID_LEN 16 + +enum { + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; -- cgit v1.2.3 From c61d788b8b1fe57aaf03ac0b5c636c7388ebfd20 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 7 Jun 2017 11:45:36 +0200 Subject: nvmet: allow overriding the NVMe VS via configfs Allow overriding the announced NVMe Version of a via configfs. This is particularly helpful when debugging new features for the host or target side without bumping the hard coded version (as the target might not be fully compliant to the announced version yet). Signed-off-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f2344aa923e8..acb484935603 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1085,4 +1085,8 @@ struct nvme_completion { #define NVME_VS(major, minor, tertiary) \ (((major) << 16) | ((minor) << 8) | (tertiary)) +#define NVME_MAJOR(ver) ((ver) >> 16) +#define NVME_MINOR(ver) (((ver) >> 8) & 0xff) +#define NVME_TERTIARY(ver) ((ver) & 0xff) + #endif /* _LINUX_NVME_H */ -- cgit v1.2.3 From 435e809058bafaa8f0bf8f55f37508b01734c9a5 Mon Sep 17 00:00:00 2001 From: Guan Junxiong Date: Tue, 13 Jun 2017 09:26:15 +0800 Subject: nvme: add fields into identify controller data structure Add the new to NVMe 1.3 fields EDSTT, DSTO, FWUG, HCTMA, MNTMT, MXTMT, and SANICAP into the idenfity controller data structure. Signed-off-by: Guan Junxiong Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index acb484935603..6d476f242ee6 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -209,9 +209,15 @@ struct nvme_id_ctrl { __u8 tnvmcap[16]; __u8 unvmcap[16]; __le32 rpmbs; - __u8 rsvd316[4]; + __le16 edstt; + __u8 dsto; + __u8 fwug; __le16 kas; - __u8 rsvd322[190]; + __le16 hctma; + __le16 mntmt; + __le16 mxtmt; + __le32 sanicap; + __u8 rsvd332[180]; __u8 sqes; __u8 cqes; __le16 maxcmd; -- cgit v1.2.3 From 6b8190d61a622e095f04451437953acd2d74b371 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Thu, 15 Jun 2017 10:44:30 -0600 Subject: nvme: implement NS Optimal IO Boundary from 1.3 Spec The NVMe 1.3 spec introduces Namespace Optimal IO Boundaries (NOIOB), which standardizes the stripe mechanism we currently have quirks for. This patch implements the necessary logic to handle this new feature. Signed-off-by: Scott Bauer Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 6d476f242ee6..291587a0743f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -282,7 +282,7 @@ struct nvme_id_ns { __le16 nabsn; __le16 nabo; __le16 nabspf; - __u16 rsvd46; + __le16 noiob; __u8 nvmcap[16]; __u8 rsvd64[40]; __u8 nguid[16]; -- cgit v1.2.3 From 7b9e93616399638521aafd1f01dfcf474c736393 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Jun 2017 18:15:21 +0200 Subject: blk-mq-sched: unify request finished methods No need to have two different callouts of bfq vs kyber. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/elevator.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 0e306c5a86d6..4acea351d43f 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -105,7 +105,7 @@ struct elevator_mq_ops { void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); - void (*put_request)(struct request *); + void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); @@ -115,7 +115,6 @@ struct elevator_mq_ops { struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); int (*get_rq_priv)(struct request_queue *, struct request *, struct bio *); - void (*put_rq_priv)(struct request_queue *, struct request *); void (*init_icq)(struct io_cq *); void (*exit_icq)(struct io_cq *); }; -- cgit v1.2.3 From 5bbf4e5a8e3a780874b2ed77bd1bd57850f3f6da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Jun 2017 18:15:26 +0200 Subject: blk-mq-sched: unify request prepare methods This patch makes sure we always allocate requests in the core blk-mq code and use a common prepare_request method to initialize them for both mq I/O schedulers. For Kyber and additional limit_depth method is added that is called before allocating the request. Also because none of the intializations can really fail the new method does not return an error - instead the bfq finish method is hardened to deal with the no-IOC case. Last but not least this removes the abuse of RQF_QUEUE by the blk-mq scheduling code as RQF_ELFPRIV is all that is needed now. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/elevator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 4acea351d43f..5bc8f8682a3e 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -104,7 +104,8 @@ struct elevator_mq_ops { int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); - struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); + void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *); + void (*prepare_request)(struct request *, struct bio *bio); void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); @@ -114,7 +115,6 @@ struct elevator_mq_ops { void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); - int (*get_rq_priv)(struct request_queue *, struct request *, struct bio *); void (*init_icq)(struct io_cq *); void (*exit_icq)(struct io_cq *); }; -- cgit v1.2.3 From af67c31fba3b879b241536a48df703a2eee18ebf Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 18 Jun 2017 14:38:57 +1000 Subject: blk: remove bio_set arg from blk_queue_split() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit blk_queue_split() is always called with the last arg being q->bio_split, where 'q' is the first arg. Also blk_queue_split() sometimes uses the passed-in 'bs' and sometimes uses q->bio_split. This is inconsistent and unnecessary. Remove the last arg and always use q->bio_split inside blk_queue_split() Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Credit-to: Javier González (Noticed that lightnvm was missed) Reviewed-by: Javier González Tested-by: Javier González Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 76b6df862a12..670df402bc51 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -944,8 +944,7 @@ extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_delay_queue(struct request_queue *, unsigned long); -extern void blk_queue_split(struct request_queue *, struct bio **, - struct bio_set *); +extern void blk_queue_split(struct request_queue *, struct bio **); extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, -- cgit v1.2.3 From 011067b05668b05aae88e5a24cff0ca0a67ca0b0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 18 Jun 2017 14:38:57 +1000 Subject: blk: replace bioset_create_nobvec() with a flags arg to bioset_create() "flags" arguments are often seen as good API design as they allow easy extensibility. bioset_create_nobvec() is implemented internally as a variation in flags passed to __bioset_create(). To support future extension, make the internal structure part of the API. i.e. add a 'flags' argument to bioset_create() and discard bioset_create_nobvec(). Note that the bio_split allocations in drivers/md/raid* do not need the bvec mempool - they should have used bioset_create_nobvec(). Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9455aada1399..985dc645637e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -373,8 +373,10 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, return bio_split(bio, sectors, gfp, bs); } -extern struct bio_set *bioset_create(unsigned int, unsigned int); -extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); +extern struct bio_set *bioset_create(unsigned int, unsigned int, int flags); +enum { + BIOSET_NEED_BVECS = BIT(0), +}; extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); -- cgit v1.2.3 From 47e0fb461fca1a68a566c82fcc006cc787312d8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 18 Jun 2017 14:38:57 +1000 Subject: blk: make the bioset rescue_workqueue optional. This patch converts bioset_create() to not create a workqueue by default, so alloctions will never trigger punt_bios_to_rescuer(). It also introduces a new flag BIOSET_NEED_RESCUER which tells bioset_create() to preserve the old behavior. All callers of bioset_create() that are inside block device drivers, are given the BIOSET_NEED_RESCUER flag. biosets used by filesystems or other top-level users do not need rescuing as the bio can never be queued behind other bios. This includes fs_bio_set, blkdev_dio_pool, btrfs_bioset, xfs_ioend_bioset, and one allocated by target_core_iblock.c. biosets used by md/raid do not need rescuing as their usage was recently audited and revised to never risk deadlock. It is hoped that most, if not all, of the remaining biosets can end up being the non-rescued version. Reviewed-by: Christoph Hellwig Credit-to: Ming Lei (minor fixes) Reviewed-by: Ming Lei Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 985dc645637e..32c786baa10a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -376,6 +376,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, extern struct bio_set *bioset_create(unsigned int, unsigned int, int flags); enum { BIOSET_NEED_BVECS = BIT(0), + BIOSET_NEED_RESCUER = BIT(1), }; extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); -- cgit v1.2.3 From 9b10f6a9c2aaab49c56b8cff0facdc1b64ed7e1c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 18 Jun 2017 14:38:59 +1000 Subject: block: remove bio_clone() and all references. bio_clone() is no longer used. Only bio_clone_bioset() or bio_clone_fast(). This is for the best, as bio_clone() used fs_bio_set, and filesystems are unlikely to want to use bio_clone(). So remove bio_clone() and all references. This includes a fix to some incorrect documentation. Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/bio.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 32c786baa10a..40d054185277 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -395,11 +395,6 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); } -static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) -{ - return bio_clone_bioset(bio, gfp_mask, fs_bio_set); -} - static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); -- cgit v1.2.3 From 97e0120990f4a7037f72c0e115e5c7f514025738 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 6 Jun 2017 23:22:01 +0800 Subject: blk-mq: move blk_mq_quiesce_queue() into include/linux/blk-mq.h We usually put blk_mq_*() into include/linux/blk-mq.h, so move this API into there. Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b144b7b0e104..99348adb3e16 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -244,6 +244,7 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 670df402bc51..8423f6baf818 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -965,7 +965,6 @@ extern void __blk_run_queue(struct request_queue *q); extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); -extern void blk_mq_quiesce_queue(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); -- cgit v1.2.3 From 4f084b41a0c04a69067be98a210e6b50969f9945 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 6 Jun 2017 23:22:02 +0800 Subject: blk-mq: introduce blk_mq_quiesce_queue_nowait() This patch introduces blk_mq_quiesce_queue_nowait() so that we can workaround mpt3sas for quiescing its queue. Once mpt3sas is fixed, we can remove this helper. Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 99348adb3e16..78a8b64074ea 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -262,6 +262,14 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); +/* + * FIXME: this helper is just for working around mpt3sas. + */ +static inline void blk_mq_quiesce_queue_nowait(struct request_queue *q) +{ + blk_mq_stop_hw_queues(q); +} + /* * Driver command data is immediately after the request. So subtract request * size to get back to the original request, add request size to get the PDU. -- cgit v1.2.3 From e4e739131ac93d373cd2d2fd92820a6a39115ba5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 6 Jun 2017 23:22:03 +0800 Subject: blk-mq: introduce blk_mq_unquiesce_queue blk_mq_start_stopped_hw_queues() is used implictly as counterpart of blk_mq_quiesce_queue() for unquiescing queue, so we introduce blk_mq_unquiesce_queue() and make it as counterpart of blk_mq_quiesce_queue() explicitly. This function is for improving the current quiescing mechanism in the following patches. Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 78a8b64074ea..787d8a2a2ac6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -245,6 +245,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); +void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); -- cgit v1.2.3 From f4560ffe8cec1361b1021d81aca6a4173f8e7c87 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 18 Jun 2017 14:24:27 -0600 Subject: blk-mq: use QUEUE_FLAG_QUIESCED to quiesce queue It is required that no dispatch can happen any more once blk_mq_quiesce_queue() returns, and we don't have such requirement on APIs of stopping queue. But blk_mq_quiesce_queue() still may not block/drain dispatch in the the case of BLK_MQ_S_START_ON_RUN, so use the new introduced flag of QUEUE_FLAG_QUIESCED and evaluate it inside RCU read-side critical sections for fixing this issue. Also blk_mq_quiesce_queue() is implemented via stopping queue, which limits its uses, and easy to cause race, because any queue restart in other paths may break blk_mq_quiesce_queue(). With the introduced flag of QUEUE_FLAG_QUIESCED, we don't need to depend on stopping queue for quiescing any more. Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++++ include/linux/blkdev.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 787d8a2a2ac6..de6536c14ae7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -268,6 +268,10 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); */ static inline void blk_mq_quiesce_queue_nowait(struct request_queue *q) { + spin_lock_irq(q->queue_lock); + queue_flag_set(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irq(q->queue_lock); + blk_mq_stop_hw_queues(q); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8423f6baf818..22cfba64ce81 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -619,6 +619,7 @@ struct request_queue { #define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 30 /* queue supports SCSI commands */ +#define QUEUE_FLAG_QUIESCED 31 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -715,6 +716,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) +#define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) static inline bool blk_account_rq(struct request *rq) { -- cgit v1.2.3 From 1d9e9bc6b56e1bb7e33e7e2e1b99d7088356c006 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 6 Jun 2017 23:22:08 +0800 Subject: blk-mq: don't stop queue for quiescing Queue can be started by other blk-mq APIs and can be used in different cases, this limits uses of blk_mq_quiesce_queue() if it is based on stopping queue, and make its usage very difficult, especially users have to use the stop queue APIs carefully for avoiding to break blk_mq_quiesce_queue(). We have applied the QUIESCED flag for draining and blocking dispatch, so it isn't necessary to stop queue any more. After stopping queue is removed, blk_mq_quiesce_queue() can be used safely and easily, then users won't worry about queue restarting during quiescing at all. Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index de6536c14ae7..f1bd13ae8f57 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -271,8 +271,6 @@ static inline void blk_mq_quiesce_queue_nowait(struct request_queue *q) spin_lock_irq(q->queue_lock); queue_flag_set(QUEUE_FLAG_QUIESCED, q); spin_unlock_irq(q->queue_lock); - - blk_mq_stop_hw_queues(q); } /* -- cgit v1.2.3 From fdd2f5b7de2afaa931e5f7bad7bcda35d1f1b479 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 20 Jun 2017 07:05:40 -0500 Subject: fs: Separate out kiocb flags setup based on RWF_* flags Also added RWF_SUPPORTED to encompass all flags. Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Goldwyn Rodrigues Signed-off-by: Jens Axboe --- include/linux/fs.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 023f0324762b..96a1a1fa54a9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3057,6 +3057,20 @@ static inline int iocb_flags(struct file *file) return res; } +static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags) +{ + if (unlikely(flags & ~RWF_SUPPORTED)) + return -EOPNOTSUPP; + + if (flags & RWF_HIPRI) + ki->ki_flags |= IOCB_HIPRI; + if (flags & RWF_DSYNC) + ki->ki_flags |= IOCB_DSYNC; + if (flags & RWF_SYNC) + ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + return 0; +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; -- cgit v1.2.3 From 7fc9e4722435cd8459182c4975f48934f2bb1274 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 20 Jun 2017 07:05:41 -0500 Subject: fs: Introduce filemap_range_has_page() filemap_range_has_page() return true if the file's mapping has a page within the range mentioned. This function will be used to check if a write() call will cause a writeback of previous writes. Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Goldwyn Rodrigues Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 96a1a1fa54a9..0d34f5b5a6b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2518,6 +2518,8 @@ extern int filemap_fdatawait(struct address_space *); extern void filemap_fdatawait_keep_errors(struct address_space *); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); +extern bool filemap_range_has_page(struct address_space *, loff_t lstart, + loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); -- cgit v1.2.3 From b745fafaf70c0a98a2e1e7ac8cb14542889ceb0e Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 20 Jun 2017 07:05:43 -0500 Subject: fs: Introduce RWF_NOWAIT and FMODE_AIO_NOWAIT RWF_NOWAIT informs kernel to bail out if an AIO request will block for reasons such as file allocations, or a writeback triggered, or would block while allocating requests while performing direct I/O. RWF_NOWAIT is translated to IOCB_NOWAIT for iocb->ki_flags. FMODE_AIO_NOWAIT is a flag which identifies the file opened is capable of returning -EAGAIN if the AIO call will block. This must be set by supporting filesystems in the ->open() call. Filesystems xfs, btrfs and ext4 would be supported in the following patches. Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Goldwyn Rodrigues Signed-off-by: Jens Axboe --- include/linux/fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0d34f5b5a6b0..4574121f4746 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -143,6 +143,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) +/* File is capable of returning -EAGAIN if AIO will block */ +#define FMODE_AIO_NOWAIT ((__force fmode_t)0x8000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are @@ -269,6 +272,7 @@ struct writeback_control; #define IOCB_DSYNC (1 << 4) #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) +#define IOCB_NOWAIT (1 << 7) struct kiocb { struct file *ki_filp; @@ -3064,6 +3068,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags) if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; + if (flags & RWF_NOWAIT) { + if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT)) + return -EOPNOTSUPP; + ki->ki_flags |= IOCB_NOWAIT; + } if (flags & RWF_HIPRI) ki->ki_flags |= IOCB_HIPRI; if (flags & RWF_DSYNC) -- cgit v1.2.3 From a38d1243704f501a4c42de1db1062ff6eba83453 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 20 Jun 2017 07:05:45 -0500 Subject: fs: Introduce IOMAP_NOWAIT IOCB_NOWAIT translates to IOMAP_NOWAIT for iomaps. This is used by XFS in the XFS patch. Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Goldwyn Rodrigues Signed-off-by: Jens Axboe --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f753e788da31..69f4e9470084 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -52,6 +52,7 @@ struct iomap { #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ #define IOMAP_DIRECT (1 << 4) /* direct I/O */ +#define IOMAP_NOWAIT (1 << 5) /* Don't wait for writeback */ struct iomap_ops { /* -- cgit v1.2.3 From 03a07c92a9ed9938d828ca7f1d11b8bc63a7bb89 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 20 Jun 2017 07:05:46 -0500 Subject: block: return on congested block device A new bio operation flag REQ_NOWAIT is introduced to identify bio's orignating from iocb with IOCB_NOWAIT. This flag indicates to return immediately if a request cannot be made instead of retrying. Stacked devices such as md (the ones with make_request_fn hooks) currently are not supported because it may block for housekeeping. For example, an md can have a part of the device suspended. For this reason, only request based devices are supported. In the future, this feature will be expanded to stacked devices by teaching them how to handle the REQ_NOWAIT flags. Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Signed-off-by: Goldwyn Rodrigues Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ++++++ include/linux/blk_types.h | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 40d054185277..36aa641cde28 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -416,6 +416,12 @@ static inline void bio_io_error(struct bio *bio) bio_endio(bio); } +static inline void bio_wouldblock_error(struct bio *bio) +{ + bio->bi_status = BLK_STS_AGAIN; + bio_endio(bio); +} + struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index dcd45b15a3a5..e210da6d14b8 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -36,6 +36,8 @@ typedef u8 __bitwise blk_status_t; /* hack for device mapper, don't use elsewhere: */ #define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) +#define BLK_STS_AGAIN ((__force blk_status_t)12) + struct blk_issue_stat { u64 stat; }; @@ -224,6 +226,7 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_NR_BITS, /* stops here */ }; @@ -242,6 +245,7 @@ enum req_flag_bits { #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) +#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From 80ab6af432523b33352771b1eca1cee793cc7c81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jun 2017 09:24:40 +0200 Subject: block: remove the unused bio_to_phys macro Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 36aa641cde28..4907bea03908 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -118,7 +118,6 @@ static inline void *bio_data(struct bio *bio) /* * will die */ -#define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio))) #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) /* -- cgit v1.2.3 From efbeccdb59d666b9c77d505af01097cc0a9d102b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jun 2017 09:24:41 +0200 Subject: block: stop using bio_data() in blk_write_same_mergeable While the Write Same page currently always is in low-level it is just as easy and safer to just compare the page and offset directly. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 22cfba64ce81..0deed7274a7f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -815,7 +815,8 @@ static inline bool rq_mergeable(struct request *rq) static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) { - if (bio_data(a) == bio_data(b)) + if (bio_page(a) == bio_page(b) && + bio_offset(a) == bio_offset(b)) return true; return false; -- cgit v1.2.3 From 073196787727e454e17a96d222ea55eba2000978 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 Jun 2017 11:15:38 -0700 Subject: blk-mq: Reduce blk_mq_hw_ctx size Since the srcu structure is rather large (184 bytes on an x86-64 system with kernel debugging disabled), only allocate it if needed. Reported-by: Ming Lei Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Cc: Hannes Reinecke Cc: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f1bd13ae8f57..3f2c22a42df6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -39,8 +39,6 @@ struct blk_mq_hw_ctx { struct blk_mq_tags *tags; struct blk_mq_tags *sched_tags; - struct srcu_struct queue_rq_srcu; - unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 @@ -62,6 +60,9 @@ struct blk_mq_hw_ctx { struct dentry *debugfs_dir; struct dentry *sched_debugfs_dir; #endif + + /* Must be the last member - see also blk_mq_hw_ctx_size(). */ + struct srcu_struct queue_rq_srcu[0]; }; struct blk_mq_tag_set { -- cgit v1.2.3 From cd6ce1482fd9e691bb68c660fa918c90f6b1bc25 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 Jun 2017 11:15:39 -0700 Subject: block: Make request operation type argument declarations consistent Instead of declaring the second argument of blk_*_get_request() as int and passing it to functions that expect an unsigned int, declare that second argument as unsigned int. Also because of consistency, rename that second argument from 'rw' into 'op'. This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Cc: Hannes Reinecke Cc: Omar Sandoval Cc: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 6 +++--- include/linux/blkdev.h | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3f2c22a42df6..3077714250ce 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -202,10 +202,10 @@ enum { BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ }; -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, +struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, unsigned int flags); -struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op, - unsigned int flags, unsigned int hctx_idx); +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, + unsigned int op, unsigned int flags, unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0deed7274a7f..e21dd893ee86 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -935,7 +935,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); -extern struct request *blk_get_request(struct request_queue *, int, gfp_t); +extern struct request *blk_get_request(struct request_queue *, unsigned int op, + gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, -- cgit v1.2.3 From d280bab305431c1836423f3cd6a5ff0e35a601ef Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 Jun 2017 11:15:40 -0700 Subject: block: Introduce request_queue.initialize_rq_fn() Several block drivers need to initialize the driver-private request data after having called blk_get_request() and before .prep_rq_fn() is called, e.g. when submitting a REQ_OP_SCSI_* request. Avoid that that initialization code has to be repeated after every blk_get_request() call by adding new callback functions to struct request_queue and to struct blk_mq_ops. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ include/linux/blkdev.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3077714250ce..366b83cee955 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -144,6 +144,8 @@ struct blk_mq_ops { init_request_fn *init_request; exit_request_fn *exit_request; reinit_request_fn *reinit_request; + /* Called from inside blk_get_request() */ + void (*initialize_rq_fn)(struct request *rq); map_queues_fn *map_queues; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e21dd893ee86..9a36164487d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -410,8 +410,12 @@ struct request_queue { rq_timed_out_fn *rq_timed_out_fn; dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; + /* Called just after a request is allocated */ init_rq_fn *init_rq_fn; + /* Called just before a request is freed */ exit_rq_fn *exit_rq_fn; + /* Called from inside blk_get_request() */ + void (*initialize_rq_fn)(struct request *rq); const struct blk_mq_ops *mq_ops; -- cgit v1.2.3 From 9e0c829906b9aa1e7ad84689f2bcd56457bdb417 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 Jun 2017 11:15:44 -0700 Subject: block: Add a comment above queue_lockdep_assert_held() Add a comment above the queue_lockdep_assert_held() macro that explains the purpose of the q->queue_lock test. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Cc: Hannes Reinecke Cc: Omar Sandoval Cc: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9a36164487d0..3e60e7a654bd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -635,6 +635,13 @@ struct request_queue { (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) +/* + * @q->queue_lock is set while a queue is being initialized. Since we know + * that no other threads access the queue object before @q->queue_lock has + * been set, it is safe to manipulate queue flags without holding the + * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and + * blk_init_allocated_queue(). + */ static inline void queue_lockdep_assert_held(struct request_queue *q) { if (q->queue_lock) -- cgit v1.2.3 From 852ec80983d682dc08a0573d37eeaa9814c4f6b1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 21 Jun 2017 10:55:47 -0700 Subject: blk-mq: Make it safe to quiesce and unquiesce from an interrupt handler Since blk_mq_quiesce_queue_nowait() can be called from interrupt context, make this safe. Since this function is not in the hot path, uninline it. Fixes: commit f4560ffe8cec ("blk-mq: use QUEUE_FLAG_QUIESCED to quiesce queue") Signed-off-by: Bart Van Assche Cc: Ming Lei Cc: Hannes Reinecke Cc: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 366b83cee955..23d32ff0b462 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -266,15 +266,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -/* - * FIXME: this helper is just for working around mpt3sas. - */ -static inline void blk_mq_quiesce_queue_nowait(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - queue_flag_set(QUEUE_FLAG_QUIESCED, q); - spin_unlock_irq(q->queue_lock); -} +void blk_mq_quiesce_queue_nowait(struct request_queue *q); /* * Driver command data is immediately after the request. So subtract request -- cgit v1.2.3 From c75b1d9421f80f4143e389d2d50ddfc8a28c8c35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 Jun 2017 11:47:04 -0600 Subject: fs: add fcntl() interface for setting/getting write life time hints Define a set of write life time hints: RWH_WRITE_LIFE_NOT_SET No hint information set RWH_WRITE_LIFE_NONE No hints about write life time RWH_WRITE_LIFE_SHORT Data written has a short life time RWH_WRITE_LIFE_MEDIUM Data written has a medium life time RWH_WRITE_LIFE_LONG Data written has a long life time RWH_WRITE_LIFE_EXTREME Data written has an extremely long life time The intent is for these values to be relative to each other, no absolute meaning should be attached to these flag names. Add an fcntl interface for querying these flags, and also for setting them as well: F_GET_RW_HINT Returns the read/write hint set on the underlying inode. F_SET_RW_HINT Set one of the above write hints on the underlying inode. F_GET_FILE_RW_HINT Returns the read/write hint set on the file descriptor. F_SET_FILE_RW_HINT Set one of the above write hints on the file descriptor. The user passes in a 64-bit pointer to get/set these values, and the interface returns 0/-1 on success/error. Sample program testing/implementing basic setting/getting of write hints is below. Add support for storing the write life time hint in the inode flags and in struct file as well, and pass them to the kiocb flags. If both a file and its corresponding inode has a write hint, then we use the one in the file, if available. The file hint can be used for sync/direct IO, for buffered writeback only the inode hint is available. This is in preparation for utilizing these hints in the block layer, to guide on-media data placement. /* * writehint.c: get or set an inode write hint */ #include #include #include #include #include #include #ifndef F_GET_RW_HINT #define F_LINUX_SPECIFIC_BASE 1024 #define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) #define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) #endif static char *str[] = { "RWF_WRITE_LIFE_NOT_SET", "RWH_WRITE_LIFE_NONE", "RWH_WRITE_LIFE_SHORT", "RWH_WRITE_LIFE_MEDIUM", "RWH_WRITE_LIFE_LONG", "RWH_WRITE_LIFE_EXTREME" }; int main(int argc, char *argv[]) { uint64_t hint; int fd, ret; if (argc < 2) { fprintf(stderr, "%s: file \n", argv[0]); return 1; } fd = open(argv[1], O_RDONLY); if (fd < 0) { perror("open"); return 2; } if (argc > 2) { hint = atoi(argv[2]); ret = fcntl(fd, F_SET_RW_HINT, &hint); if (ret < 0) { perror("fcntl: F_SET_RW_HINT"); return 4; } } ret = fcntl(fd, F_GET_RW_HINT, &hint); if (ret < 0) { perror("fcntl: F_GET_RW_HINT"); return 3; } printf("%s: hint %s\n", argv[1], str[hint]); close(fd); return 0; } Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/fs.h | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4574121f4746..65adbddb3163 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -265,6 +266,18 @@ struct page; struct address_space; struct writeback_control; +/* + * Write life time hint values. + */ +enum rw_hint { + WRITE_LIFE_NOT_SET = 0, + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, +}; + #define IOCB_EVENTFD (1 << 0) #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) @@ -280,6 +293,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; + enum rw_hint ki_hint; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -287,16 +301,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) return kiocb->ki_complete == NULL; } -static inline int iocb_flags(struct file *file); - -static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) -{ - *kiocb = (struct kiocb) { - .ki_filp = filp, - .ki_flags = iocb_flags(filp), - }; -} - /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet @@ -597,6 +601,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; unsigned int i_blkbits; + enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED @@ -851,6 +856,7 @@ struct file { * Must not be taken from IRQ context. */ spinlock_t f_lock; + enum rw_hint f_write_hint; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; @@ -1026,8 +1032,6 @@ struct file_lock_context { #define OFFT_OFFSET_MAX INT_LIMIT(off_t) #endif -#include - extern void send_sigio(struct fown_struct *fown, int fd, int band); /* @@ -1878,6 +1882,25 @@ static inline bool HAS_UNMAPPED_ID(struct inode *inode) return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid); } +static inline enum rw_hint file_write_hint(struct file *file) +{ + if (file->f_write_hint != WRITE_LIFE_NOT_SET) + return file->f_write_hint; + + return file_inode(file)->i_write_hint; +} + +static inline int iocb_flags(struct file *file); + +static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) +{ + *kiocb = (struct kiocb) { + .ki_filp = filp, + .ki_flags = iocb_flags(filp), + .ki_hint = file_write_hint(filp), + }; +} + /* * Inode state bits. Protected by inode->i_lock * -- cgit v1.2.3 From cb6934f8ea1a595902ca37e250e0917d4dd7b2a7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 Jun 2017 09:22:02 -0600 Subject: block: add support for write hints in a bio No functional changes in this patch, we just use up some holes in the bio and request structures to define a write hint that we psas down the stack. Ensure that we don't merge requests that have different life time hints assigned to them, and that we inherit the write hint when cloning a bio. Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + include/linux/blkdev.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e210da6d14b8..d2eb87c84d82 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -56,6 +56,7 @@ struct bio { */ unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; + unsigned short bi_write_hint; struct bvec_iter bi_iter; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bf2157141d53..0eebd3bcfd85 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -225,6 +225,8 @@ struct request { unsigned int extra_len; /* length of alignment and padding */ + unsigned short write_hint; + unsigned long deadline; struct list_head timeout_list; -- cgit v1.2.3 From f793dfd3f39a3dc50468b06498606b3a906f42f1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Jun 2017 08:15:27 -0600 Subject: blk-mq: expose write hints through debugfs Useful to verify that things are working the way they should. Reading the file will return number of kb written with each write hint. Writing the file will reset the statistics. No care is taken to ensure that we don't race on updates. Drivers will write to q->write_hints[] if they handle a given write hint. Reviewed-by: Andreas Dilger Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0eebd3bcfd85..e1e289ab66b9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -596,6 +596,9 @@ struct request_queue { void *rq_alloc_data; struct work_struct release_work; + +#define BLK_MAX_WRITE_HINTS 5 + u64 write_hints[BLK_MAX_WRITE_HINTS]; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -- cgit v1.2.3 From f5d118406247acfc4fc481e441e01ea4d6318fdc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 Jun 2017 12:03:06 -0600 Subject: nvme: add support for streams and directives This adds support for Directives in NVMe, particular for the Streams directive. Support for Directives is a new feature in NVMe 1.3. It allows a user to pass in information about where to store the data, so that it the device can do so most effiently. If an application is managing and writing data with different life times, mixing differently retentioned data onto the same locations on flash can cause write amplification to grow. This, in turn, will reduce performance and life time of the device. Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/nvme.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 291587a0743f..f516a975bb21 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -253,6 +253,7 @@ enum { NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7, }; @@ -303,6 +304,19 @@ enum { NVME_ID_CNS_CTRL_LIST = 0x13, }; +enum { + NVME_DIR_IDENTIFY = 0x00, + NVME_DIR_STREAMS = 0x01, + NVME_DIR_SND_ID_OP_ENABLE = 0x01, + NVME_DIR_SND_ST_OP_REL_ID = 0x01, + NVME_DIR_SND_ST_OP_REL_RSC = 0x02, + NVME_DIR_RCV_ID_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_STATUS = 0x02, + NVME_DIR_RCV_ST_OP_RESOURCE = 0x03, + NVME_DIR_ENDIR = 0x01, +}; + enum { NVME_NS_FEAT_THIN = 1 << 0, NVME_NS_FLBAS_LBA_MASK = 0xf, @@ -560,6 +574,7 @@ enum { NVME_RW_PRINFO_PRCHK_APP = 1 << 11, NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_DTYPE_STREAMS = 1 << 4, }; struct nvme_dsm_cmd { @@ -634,6 +649,8 @@ enum nvme_admin_opcode { nvme_admin_download_fw = 0x11, nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, + nvme_admin_directive_send = 0x19, + nvme_admin_directive_recv = 0x1a, nvme_admin_dbbuf = 0x7C, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, @@ -797,6 +814,24 @@ struct nvme_get_log_page_command { __u32 rsvd14[2]; }; +struct nvme_directive_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 numd; + __u8 doper; + __u8 dtype; + __le16 dspec; + __u8 endir; + __u8 tdtype; + __u16 rsvd15; + + __u32 rsvd16[3]; +}; + /* * Fabrics subcommands. */ @@ -927,6 +962,18 @@ struct nvme_dbbuf { __u32 rsvd12[6]; }; +struct streams_directive_params { + __u16 msl; + __u16 nssa; + __u16 nsso; + __u8 rsvd[10]; + __u32 sws; + __u16 sgs; + __u16 nsa; + __u16 nso; + __u8 rsvd2[6]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -947,6 +994,7 @@ struct nvme_command { struct nvmf_property_set_command prop_set; struct nvmf_property_get_command prop_get; struct nvme_dbbuf dbbuf; + struct nvme_directive_cmd directive; }; }; -- cgit v1.2.3 From 3bce016a4c5975e4279bfb3cbd6d0332b856cc72 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jun 2017 09:26:21 +0200 Subject: block: move bounce declarations to block/blk.h Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e1e289ab66b9..e7eef48c97c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -884,19 +884,6 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn; #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) -#ifdef CONFIG_BOUNCE -extern int init_emergency_isa_pool(void); -extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); -#else -static inline int init_emergency_isa_pool(void) -{ - return 0; -} -static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) -{ -} -#endif /* CONFIG_MMU */ - struct rq_map_data { struct page **pages; int page_order; -- cgit v1.2.3 From 1c4bc3ab9a064d98cdf6de6b44f89d5c3757fa32 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jun 2017 09:26:22 +0200 Subject: block: remove the queue_bounce_pfn helper Only used inside the bounce code, and opencoding it makes it more obvious what is going on. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e7eef48c97c9..25f6a0cb27d3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1385,11 +1385,6 @@ enum blk_default_limits { #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) -static inline unsigned long queue_bounce_pfn(struct request_queue *q) -{ - return q->limits.bounce_pfn; -} - static inline unsigned long queue_segment_boundary(struct request_queue *q) { return q->limits.seg_boundary_mask; -- cgit v1.2.3 From 7aa1f42752f0d31a5bb6d0d5bac92fc8c2044ce2 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 18 Jun 2017 16:15:59 +0300 Subject: nvme: use a single NVME_AQ_DEPTH and relax it to 32 No need to differentiate fabrics from pci/loop, also lower it to 32 as we don't really need 256 inflight admin commands. Signed-off-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f516a975bb21..6b8ee9e628e1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -87,7 +87,7 @@ enum { NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ }; -#define NVMF_AQ_DEPTH 32 +#define NVME_AQ_DEPTH 32 enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ -- cgit v1.2.3