From 729204ef49ec00b788ce23deb9eb922a5769f55d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 17 Dec 2016 18:49:09 +0800 Subject: block: relax check on sg gap If the last bvec of the 1st bio and the 1st bvec of the next bio are physically contigious, and the latter can be merged to last segment of the 1st bio, we should think they don't violate sg gap(or virt boundary) limit. Both Vitaly and Dexuan reported lots of unmergeable small bios are observed when running mkfs on Hyper-V virtual storage, and performance becomes quite low. This patch fixes that performance issue. The same issue should exist on NVMe, since it sets virt boundary too. Reported-by: Vitaly Kuznetsov Reported-by: Dexuan Cui Tested-by: Dexuan Cui Cc: Keith Busch Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83695641bd5e..b20da8dfa7ec 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1607,6 +1607,25 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } +/* + * Check if the two bvecs from two bios can be merged to one segment. + * If yes, no need to check gap between the two bios since the 1st bio + * and the 1st bvec in the 2nd bio can be handled in one segment. + */ +static inline bool bios_segs_mergeable(struct request_queue *q, + struct bio *prev, struct bio_vec *prev_last_bv, + struct bio_vec *next_first_bv) +{ + if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv)) + return false; + if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv)) + return false; + if (prev->bi_seg_back_size + next_first_bv->bv_len > + queue_max_segment_size(q)) + return false; + return true; +} + static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, struct bio *next) { @@ -1616,7 +1635,8 @@ static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, bio_get_last_bvec(prev, &pb); bio_get_first_bvec(next, &nb); - return __bvec_gap_to_prev(q, &pb, nb.bv_offset); + if (!bios_segs_mergeable(q, prev, &pb, &nb)) + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); } return false; -- cgit v1.2.3 From f8a5b12247fe18f7fed801ad262a7ab190e1f848 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Dec 2016 09:24:51 -0700 Subject: blk-mq: make mq_ops a const pointer We never change it, make that clear. Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche --- include/linux/blk-mq.h | 2 +- include/linux/blkdev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4a2ab5d99ff7..afc81d77e471 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -60,7 +60,7 @@ struct blk_mq_hw_ctx { struct blk_mq_tag_set { unsigned int *mq_map; - struct blk_mq_ops *ops; + const struct blk_mq_ops *ops; unsigned int nr_hw_queues; unsigned int queue_depth; /* max hw supported */ unsigned int reserved_tags; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b20da8dfa7ec..2e99d659b0f1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -407,7 +407,7 @@ struct request_queue { dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; - struct blk_mq_ops *mq_ops; + const struct blk_mq_ops *mq_ops; unsigned int *mq_map; -- cgit v1.2.3 From c51ca6cf545bc51ad38bd50816bde37c647d608d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 10 Dec 2016 15:13:59 -0700 Subject: block: move existing elevator ops to union Prep patch for adding MQ ops as well, since doing anon unions with named initializers doesn't work on older compilers. Signed-off-by: Jens Axboe Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- include/linux/elevator.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b276e9ef0e0b..2a9e966eed03 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -94,7 +94,9 @@ struct elevator_type struct kmem_cache *icq_cache; /* fields provided by elevator implementation */ - struct elevator_ops ops; + union { + struct elevator_ops sq; + } ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; -- cgit v1.2.3 From 16a3c2a70cad5ccdc2dc0a4544bff82554807493 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Dec 2016 14:27:46 -0700 Subject: blk-mq: un-export blk_mq_free_hctx_request() It's only used in blk-mq, kill it from the main exported header and kill the symbol export as well. Signed-off-by: Jens Axboe Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index afc81d77e471..2686f9e7302a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -181,7 +181,6 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); -void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { -- cgit v1.2.3 From fd2d332677c687ca90c12a47d6c377c547100b56 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Jan 2017 10:04:45 -0700 Subject: blk-mq: add support for carrying internal tag information in blk_qc_t No functional change in this patch, just in preparation for having two types of tags available to the block layer for a single request. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval --- include/linux/blk_types.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 519ea2c9df61..0e5b1cd5113c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -232,22 +232,29 @@ static inline bool op_is_sync(unsigned int op) } typedef unsigned int blk_qc_t; -#define BLK_QC_T_NONE -1U -#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_NONE -1U +#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { return cookie != BLK_QC_T_NONE; } -static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num) +static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num, + bool internal) { - return tag | (queue_num << BLK_QC_T_SHIFT); + blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT); + + if (internal) + ret |= BLK_QC_T_INTERNAL; + + return ret; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) { - return cookie >> BLK_QC_T_SHIFT; + return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT; } static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) @@ -255,6 +262,11 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +static inline bool blk_qc_t_is_internal(blk_qc_t cookie) +{ + return (cookie & BLK_QC_T_INTERNAL) != 0; +} + struct blk_issue_stat { u64 time; }; -- cgit v1.2.3 From bd166ef183c263c5ced656d49ef19c7da4adc774 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Jan 2017 06:03:22 -0700 Subject: blk-mq-sched: add framework for MQ capable IO schedulers This adds a set of hooks that intercepts the blk-mq path of allocating/inserting/issuing/completing requests, allowing us to develop a scheduler within that framework. We reuse the existing elevator scheduler API on the registration side, but augment that with the scheduler flagging support for the blk-mq interfce, and with a separate set of ops hooks for MQ devices. We split driver and scheduler tags, so we can run the scheduling independently of device queue depth. Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- include/linux/blk-mq.h | 5 ++++- include/linux/blkdev.h | 4 +++- include/linux/elevator.h | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2686f9e7302a..63569eb46d15 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -22,6 +22,7 @@ struct blk_mq_hw_ctx { unsigned long flags; /* BLK_MQ_F_* flags */ + void *sched_data; struct request_queue *queue; struct blk_flush_queue *fq; @@ -35,6 +36,7 @@ struct blk_mq_hw_ctx { atomic_t wait_index; struct blk_mq_tags *tags; + struct blk_mq_tags *sched_tags; struct srcu_struct queue_rq_srcu; @@ -156,6 +158,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, + BLK_MQ_S_SCHED_RESTART = 2, BLK_MQ_MAX_DEPTH = 10240, @@ -179,13 +182,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ + BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ }; struct request *blk_mq_alloc_request(struct request_queue *q, int rw, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2e99d659b0f1..25564857f5f8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -154,6 +154,7 @@ struct request { /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ + int tag; sector_t __sector; /* sector cursor */ struct bio *bio; @@ -220,9 +221,10 @@ struct request { unsigned short ioprio; + int internal_tag; + void *special; /* opaque pointer available for LLD use */ - int tag; int errors; /* diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2a9e966eed03..ecb96fd67c6d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -77,6 +77,34 @@ struct elevator_ops elevator_registered_fn *elevator_registered_fn; }; +struct blk_mq_alloc_data; +struct blk_mq_hw_ctx; + +struct elevator_mq_ops { + int (*init_sched)(struct request_queue *, struct elevator_type *); + void (*exit_sched)(struct elevator_queue *); + + bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); + bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); + int (*request_merge)(struct request_queue *q, struct request **, struct bio *); + void (*request_merged)(struct request_queue *, struct request *, int); + void (*requests_merged)(struct request_queue *, struct request *, struct request *); + struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); + void (*put_request)(struct request *); + void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); + void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *); + bool (*has_work)(struct blk_mq_hw_ctx *); + void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); + void (*started_request)(struct request *); + void (*requeue_request)(struct request *); + struct request *(*former_request)(struct request_queue *, struct request *); + struct request *(*next_request)(struct request_queue *, struct request *); + int (*get_rq_priv)(struct request_queue *, struct request *); + void (*put_rq_priv)(struct request_queue *, struct request *); + void (*init_icq)(struct io_cq *); + void (*exit_icq)(struct io_cq *); +}; + #define ELV_NAME_MAX (16) struct elv_fs_entry { @@ -96,12 +124,14 @@ struct elevator_type /* fields provided by elevator implementation */ union { struct elevator_ops sq; + struct elevator_mq_ops mq; } ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; struct module *elevator_owner; + bool uses_mq; /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */ @@ -125,6 +155,7 @@ struct elevator_queue struct kobject kobj; struct mutex sysfs_lock; unsigned int registered:1; + unsigned int uses_mq:1; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; @@ -141,6 +172,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *, extern void elv_merged_request(struct request_queue *, struct request *, int); extern void elv_bio_merged(struct request_queue *q, struct request *, struct bio *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -- cgit v1.2.3 From d34849913819a5e0cbfbe724dbe79df89278c524 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jan 2017 14:43:58 -0700 Subject: blk-mq-sched: allow setting of default IO scheduler Add Kconfig entries to manage what devices get assigned an MQ scheduler, and add a blk-mq flag for drivers to opt out of scheduling. The latter is useful for admin type queues that still allocate a blk-mq queue and tag set, but aren't use for normal IO. Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 63569eb46d15..8e4df3d6c8cd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -153,6 +153,7 @@ enum { BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_BLOCKING = 1 << 5, + BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, -- cgit v1.2.3 From 07e4fead45e6e1932f0b960655ab554b6aab6a08 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:40 -0800 Subject: blk-mq: create debugfs directory tree In preparation for putting blk-mq debugging information in debugfs, create a directory tree mirroring the one in sysfs: # tree -d /sys/kernel/debug/block /sys/kernel/debug/block |-- nvme0n1 | `-- mq | |-- 0 | | `-- cpu0 | |-- 1 | | `-- cpu1 | |-- 2 | | `-- cpu2 | `-- 3 | `-- cpu3 `-- vda `-- mq `-- 0 |-- cpu0 |-- cpu1 |-- cpu2 `-- cpu3 Also add the scaffolding for the actual files that will go in here, either under the hardware queue or software queue directories. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25564857f5f8..0ee283f3cffe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,11 @@ struct request_queue { struct list_head tag_set_list; struct bio_set *bio_split; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dir; + struct dentry *mq_debugfs_dir; +#endif + bool mq_sysfs_init_done; }; -- cgit v1.2.3 From 24af1ccfe12adddbe17d11801e1689791a4cc282 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 14:32:13 -0800 Subject: sbitmap: add helpers for dumping to a seq_file This is useful debugging information that will be used in the blk-mq debugfs directory. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Changed 'weight' to 'busy'. Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index f017fd6e69c4..d4e0a204c118 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -258,6 +258,26 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) unsigned int sbitmap_weight(const struct sbitmap *sb); +/** + * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. + * @sb: Bitmap to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The format may change at any time. + */ +void sbitmap_show(struct sbitmap *sb, struct seq_file *m); + +/** + * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct + * seq_file. + * @sb: Bitmap to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The output isn't guaranteed to be internally + * consistent. + */ +void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); + /** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. @@ -370,4 +390,14 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); +/** + * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct + * seq_file. + * @sbq: Bitmap queue to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The format may change at any time. + */ +void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); + #endif /* __LINUX_SCALE_BITMAP_H */ -- cgit v1.2.3 From 50e1dab86aa2c10cbca2f754aae9542169403141 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 14:42:34 -0700 Subject: blk-mq-sched: fix starvation for multiple hardware queues and shared tags If we have both multiple hardware queues and shared tag map between devices, we need to ensure that we propagate the hardware queue restart bit higher up. This is because we can get into a situation where we don't have any IO pending on a hardware queue, yet we fail getting a tag to start new IO. If that happens, it's not enough to mark the hardware queue as needing a restart, we need to bubble that up to the higher level queue as well. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval Tested-by: Hannes Reinecke --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0ee283f3cffe..883b8abe4305 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -607,6 +607,7 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ +#define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From c13660a08c8b3bb49def4374bfd414aaaa564662 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 12:40:07 -0700 Subject: blk-mq-sched: change ->dispatch_requests() to ->dispatch_request() When we invoke dispatch_requests(), the scheduler empties everything into the passed in list. This isn't always a good thing, since it means that we remove items that we could have potentially merged with. Change the function to dispatch single requests at the time. If we do that, we can backoff exactly at the point where the device can't consume more IO, and leave the rest with the scheduler for better merging and future dispatch decision making. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval Tested-by: Hannes Reinecke --- include/linux/elevator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ecb96fd67c6d..b5825c4f06f7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -92,7 +92,7 @@ struct elevator_mq_ops { struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); void (*put_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); - void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *); + struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); void (*started_request)(struct request *); -- cgit v1.2.3 From f73f44eb00cb136990cfb7d40e436c13d7669ec8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Jan 2017 08:30:47 -0700 Subject: block: add a op_is_flush helper This centralizes the checks for bios that needs to be go into the flush state machine. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0e5b1cd5113c..37c9a43c5e78 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -220,6 +220,15 @@ static inline bool op_is_write(unsigned int op) return (op & 1); } +/* + * Check if the bio or request is one that needs special treatment in the + * flush state machine. + */ +static inline bool op_is_flush(unsigned int op) +{ + return op & (REQ_FUA | REQ_PREFLUSH); +} + /* * Reads are always treated as synchronous, as are requests with the FUA or * PREFLUSH flag. Other operations may be marked as synchronous using the -- cgit v1.2.3 From 5ea708d15a928f7a479987704203616d3274c03b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Jan 2017 14:52:44 +0300 Subject: block: simplify blk_init_allocated_queue Return an errno value instead of the passed in queue so that the callers don't have to keep track of two queues, and move the assignment of the request_fn and lock to the caller as passing them as argument doesn't simplify anything. While we're at it also remove two pointless NULL assignments, given that the request structure is zeroed on allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 05675b1dfd20..6b1efc5760ea 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1137,8 +1137,7 @@ extern void blk_unprep_request(struct request *); extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id); extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); -extern struct request_queue *blk_init_allocated_queue(struct request_queue *, - request_fn_proc *, spinlock_t *); +extern int blk_init_allocated_queue(struct request_queue *); extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); -- cgit v1.2.3 From 6d247d7f71d1fa4b66a5f4da7b1daa21510d529b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Jan 2017 09:51:45 -0700 Subject: block: allow specifying size for extra command data This mirrors the blk-mq capabilities to allocate extra drivers-specific data behind struct request by setting a cmd_size field, as well as having a constructor / destructor for it. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6b1efc5760ea..461b7cf6af1d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -273,6 +273,8 @@ typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); typedef int (lld_busy_fn) (struct request_queue *q); typedef int (bsg_job_fn) (struct bsg_job *); +typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t); +typedef void (exit_rq_fn)(struct request_queue *, struct request *); enum blk_eh_timer_return { BLK_EH_NOT_HANDLED, @@ -408,6 +410,8 @@ struct request_queue { rq_timed_out_fn *rq_timed_out_fn; dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; + init_rq_fn *init_rq_fn; + exit_rq_fn *exit_rq_fn; const struct blk_mq_ops *mq_ops; @@ -577,6 +581,9 @@ struct request_queue { #endif bool mq_sysfs_init_done; + + size_t cmd_size; + void *rq_alloc_data; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -- cgit v1.2.3 From 48b77ad6084481ef9330a5d2bee289966da0975b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Jan 2017 09:35:54 +0100 Subject: block: cleanup tracing A couple tweaks to the tracing code: - trace the request size for all requests - trace request sector and nr_sectors only for fs requests, enforced by helpers - drop SCSI CDB tracing - we have SCSI tracing for this and are going to me the CDB out of the generic struct request soon. With this the tracing code stops to know about BLOCK_PC requests entirely, it's just FS vs passthrough requests now, where the latter includes any driver-private requests. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 14 +++++++------- include/trace/events/block.h | 27 +++++++++++---------------- 2 files changed, 18 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index e417f080219a..a143a67a6f33 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -110,16 +110,16 @@ struct compat_blk_user_trace_setup { #endif -#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK) +extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); -static inline int blk_cmd_buf_len(struct request *rq) +static inline sector_t blk_rq_trace_sector(struct request *rq) { - return (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? rq->cmd_len * 3 : 1; + return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_pos(rq); } -extern void blk_dump_cmd(char *buf, struct request *rq); -extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); - -#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ +static inline unsigned int blk_rq_trace_nr_sectors(struct request *rq) +{ + return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_sectors(rq); +} #endif diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 3e02e3a25413..a88ed13446ff 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -73,19 +73,17 @@ DECLARE_EVENT_CLASS(block_rq_with_error, __field( unsigned int, nr_sector ) __field( int, errors ) __array( char, rwbs, RWBS_LEN ) - __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; - __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? - 0 : blk_rq_pos(rq); - __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? - 0 : blk_rq_sectors(rq); + __entry->sector = blk_rq_trace_sector(rq); + __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->errors = rq->errors; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); - blk_dump_cmd(__get_str(cmd), rq); + __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", @@ -153,7 +151,7 @@ TRACE_EVENT(block_rq_complete, __field( unsigned int, nr_sector ) __field( int, errors ) __array( char, rwbs, RWBS_LEN ) - __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( @@ -163,7 +161,7 @@ TRACE_EVENT(block_rq_complete, __entry->errors = rq->errors; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); - blk_dump_cmd(__get_str(cmd), rq); + __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", @@ -186,20 +184,17 @@ DECLARE_EVENT_CLASS(block_rq, __field( unsigned int, bytes ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) - __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; - __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? - 0 : blk_rq_pos(rq); - __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? - 0 : blk_rq_sectors(rq); - __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? - blk_rq_bytes(rq) : 0; + __entry->sector = blk_rq_trace_sector(rq); + __entry->nr_sector = blk_rq_trace_nr_sectors(rq); + __entry->bytes = blk_rq_bytes(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); - blk_dump_cmd(__get_str(cmd), rq); + __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), -- cgit v1.2.3 From eb8db831be80692bf4bda3dfc55001daf64ec299 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 Jan 2017 18:32:46 +0100 Subject: dm: always defer request allocation to the owner of the request_queue DM already calls blk_mq_alloc_request on the request_queue of the underlying device if it is a blk-mq device. But now that we allow drivers to allocate additional data and initialize it ahead of time we need to do the same for all drivers. Doing so and using the new cmd_size infrastructure in the block layer greatly simplifies the dm-rq and mpath code, and should also make arbitrary combinations of SQ and MQ devices with SQ or MQ device mapper tables easily possible as a further step. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/device-mapper.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ef7962e84444..a7e6903866fd 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -55,8 +55,6 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti); * = 2: The target wants to push back the io */ typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio); -typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone, - union map_info *map_context); typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti, struct request *rq, union map_info *map_context, @@ -163,7 +161,6 @@ struct target_type { dm_ctr_fn ctr; dm_dtr_fn dtr; dm_map_fn map; - dm_map_request_fn map_rq; dm_clone_and_map_request_fn clone_and_map_rq; dm_release_clone_request_fn release_clone_rq; dm_endio_fn end_io; -- cgit v1.2.3 From d48777a633d6fa7ccde0f0e6509f0c01fbfc5299 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Jan 2017 21:52:10 +0300 Subject: scsi: remove __scsi_alloc_queue Instead do an internal export of __scsi_init_queue for the transport classes that export BSG nodes. Signed-off-by: Christoph Hellwig Acked-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/scsi/scsi_host.h | 2 -- include/scsi/scsi_transport.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 36680f13270d..f4964d7db313 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -826,8 +826,6 @@ extern void scsi_block_requests(struct Scsi_Host *); struct class_container; -extern struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, - void (*) (struct request_queue *)); /* * These two functions are used to allocate and free a pseudo device * which will connect to the host adapter itself rather than any diff --git a/include/scsi/scsi_transport.h b/include/scsi/scsi_transport.h index 81292392adbc..b6e07b56d013 100644 --- a/include/scsi/scsi_transport.h +++ b/include/scsi/scsi_transport.h @@ -119,4 +119,6 @@ scsi_transport_device_data(struct scsi_device *sdev) + shost->transportt->device_private_offset; } +void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q); + #endif /* SCSI_TRANSPORT_H */ -- cgit v1.2.3 From e9c787e65c0c36529745be47d490d998b4b6e589 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Jan 2017 21:55:26 +0300 Subject: scsi: allocate scsi_cmnd structures as part of struct request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rely on the new block layer functionality to allocate additional driver specific data behind struct request instead of implementing it in SCSI itѕelf. Signed-off-by: Christoph Hellwig Acked-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/scsi/scsi_host.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index f4964d7db313..3cd8c3bec638 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -551,9 +551,6 @@ struct Scsi_Host { struct list_head __devices; struct list_head __targets; - struct scsi_host_cmd_pool *cmd_pool; - spinlock_t free_list_lock; - struct list_head free_list; /* backup store of cmd structs */ struct list_head starved_list; spinlock_t default_lock; -- cgit v1.2.3 From 8ae94eb65be9425af4d57a4f4cfebfdf03081e93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Jan 2017 15:25:02 +0300 Subject: block/bsg: move queue creation into bsg_setup_queue Simply the boilerplate code needed for bsg nodes a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/bsg-lib.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index 657a718c27d2..e34dde2da0ef 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -66,9 +66,8 @@ struct bsg_job { void bsg_job_done(struct bsg_job *job, int result, unsigned int reply_payload_rcv_len); -int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name, - bsg_job_fn *job_fn, int dd_job_size); -void bsg_request_fn(struct request_queue *q); +struct request_queue *bsg_setup_queue(struct device *dev, char *name, + bsg_job_fn *job_fn, int dd_job_size); void bsg_job_put(struct bsg_job *job); int __must_check bsg_job_get(struct bsg_job *job); -- cgit v1.2.3 From 82ed4db499b8598f16f8871261bff088d6b0597f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Jan 2017 09:46:29 +0100 Subject: block: split scsi_request out of struct request And require all drivers that want to support BLOCK_PC to allocate it as the first thing of their private data. To support this the legacy IDE and BSG code is switched to set cmd_size on their queues to let the block layer allocate the additional space. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 ------------- include/linux/ide.h | 8 +++++++- include/scsi/scsi_cmnd.h | 2 ++ include/scsi/scsi_request.h | 30 ++++++++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 14 deletions(-) create mode 100644 include/scsi/scsi_request.h (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 461b7cf6af1d..e4c5f284fe2d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -128,8 +128,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_NOMERGE_FLAGS \ (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) -#define BLK_MAX_CDB 16 - /* * Try to put the fields that are referenced together in the same cacheline. * @@ -227,17 +225,7 @@ struct request { int errors; - /* - * when request is used as a packet command carrier - */ - unsigned char __cmd[BLK_MAX_CDB]; - unsigned char *cmd; - unsigned short cmd_len; - unsigned int extra_len; /* length of alignment and padding */ - unsigned int sense_len; - unsigned int resid_len; /* residual count */ - void *sense; unsigned long deadline; struct list_head timeout_list; @@ -925,7 +913,6 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); -extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, diff --git a/include/linux/ide.h b/include/linux/ide.h index a633898f36ac..086fbe172817 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -20,6 +20,7 @@ #include /* for request_sense */ #include +#include #include #include @@ -52,6 +53,11 @@ enum ata_cmd_type_bits { ((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \ (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME) +struct ide_request { + struct scsi_request sreq; + u8 sense[SCSI_SENSE_BUFFERSIZE]; +}; + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -579,7 +585,7 @@ struct ide_drive_s { /* current sense rq and buffer */ bool sense_rq_armed; - struct request sense_rq; + struct request *sense_rq; struct request_sense sense_data; }; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 9fc1aecfc813..f708f1acfc50 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -8,6 +8,7 @@ #include #include #include +#include struct Scsi_Host; struct scsi_driver; @@ -57,6 +58,7 @@ struct scsi_pointer { #define SCMD_TAGGED (1 << 0) struct scsi_cmnd { + struct scsi_request req; struct scsi_device *device; struct list_head list; /* scsi_cmnd participates in queue lists */ struct list_head eh_entry; /* entry for the host eh_cmd_q */ diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h new file mode 100644 index 000000000000..ba0aeb980f7e --- /dev/null +++ b/include/scsi/scsi_request.h @@ -0,0 +1,30 @@ +#ifndef _SCSI_SCSI_REQUEST_H +#define _SCSI_SCSI_REQUEST_H + +#include + +#define BLK_MAX_CDB 16 + +struct scsi_request { + unsigned char __cmd[BLK_MAX_CDB]; + unsigned char *cmd; + unsigned short cmd_len; + unsigned int sense_len; + unsigned int resid_len; /* residual count */ + void *sense; +}; + +static inline struct scsi_request *scsi_req(struct request *rq) +{ + return blk_mq_rq_to_pdu(rq); +} + +static inline void scsi_req_free_cmd(struct scsi_request *req) +{ + if (req->cmd != req->__cmd) + kfree(req->cmd); +} + +void scsi_req_init(struct request *); + +#endif /* _SCSI_SCSI_REQUEST_H */ -- cgit v1.2.3 From ade69e2432b795c76653e1dfa09c684549826a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:09 +0100 Subject: lightnvm: merge gennvm with core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the first iteration of Open-Channel SSDs, it was anticipated that there could be various media managers on top of an open-channel SSD, such to allow vendors to plug in their own host-side FTLs, without the media manager in between. Now that an Open-Channel SSD is exposed as a traditional block device, there is no longer a need for this. Therefore lets merge the gennvm code with core and simplify the stack. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 97 ++++-------------------------------------------- 1 file changed, 7 insertions(+), 90 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7c273bbc5351..84309fe27472 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -80,8 +80,6 @@ struct nvm_dev_ops { unsigned int max_phys_sect; }; - - #ifdef CONFIG_NVM #include @@ -272,15 +270,6 @@ enum { NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; -/* system block cpu representation */ -struct nvm_sb_info { - unsigned long seqnr; - unsigned long erase_cnt; - unsigned int version; - char mmtype[NVM_MMTYPE_LEN]; - struct ppa_addr fs_ppa; -}; - /* Device generic information */ struct nvm_geo { int nr_chnls; @@ -308,6 +297,7 @@ struct nvm_geo { int sec_per_lun; }; +/* sub-device structure */ struct nvm_tgt_dev { /* Device information */ struct nvm_geo geo; @@ -329,17 +319,10 @@ struct nvm_dev { struct list_head devices; - /* Media manager */ - struct nvmm_type *mt; - void *mp; - - /* System blocks */ - struct nvm_sb_info sb; - /* Device information */ struct nvm_geo geo; - /* lower page table */ + /* lower page table */ int lps_per_blk; int *lptbl; @@ -359,6 +342,10 @@ struct nvm_dev { struct mutex mlock; spinlock_t lock; + + /* target management */ + struct list_head area_list; + struct list_head targets; }; static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, @@ -452,11 +439,6 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) (ppa1.g.blk == ppa2.g.blk)); } -static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) -{ - return dev->lptbl[slc_pg]; -} - typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); @@ -487,49 +469,6 @@ extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *); extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); -typedef int (nvmm_register_fn)(struct nvm_dev *); -typedef void (nvmm_unregister_fn)(struct nvm_dev *); - -typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); -typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); -typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *); -typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int); -typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); -typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); -typedef struct ppa_addr (nvmm_trans_ppa_fn)(struct nvm_tgt_dev *, - struct ppa_addr, int); -typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int); - -enum { - TRANS_TGT_TO_DEV = 0x0, - TRANS_DEV_TO_TGT = 0x1, -}; - -struct nvmm_type { - const char *name; - unsigned int version[3]; - - nvmm_register_fn *register_mgr; - nvmm_unregister_fn *unregister_mgr; - - nvmm_create_tgt_fn *create_tgt; - nvmm_remove_tgt_fn *remove_tgt; - - nvmm_submit_io_fn *submit_io; - nvmm_erase_blk_fn *erase_blk; - - nvmm_get_area_fn *get_area; - nvmm_put_area_fn *put_area; - - nvmm_trans_ppa_fn *trans_ppa; - nvmm_part_to_tgt_fn *part_to_tgt; - - struct list_head list; -}; - -extern int nvm_register_mgr(struct nvmm_type *); -extern void nvm_unregister_mgr(struct nvmm_type *); - extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); @@ -559,31 +498,9 @@ extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -/* sysblk.c */ -#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ - -/* system block on disk representation */ -struct nvm_system_block { - __be32 magic; /* magic signature */ - __be32 seqnr; /* sequence number */ - __be32 erase_cnt; /* erase count */ - __be16 version; /* version number */ - u8 mmtype[NVM_MMTYPE_LEN]; /* media manager name */ - __be64 fs_ppa; /* PPA for media manager - * superblock */ -}; - -extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); -extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); -extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); - extern int nvm_dev_factory(struct nvm_dev *, int flags); -#define nvm_for_each_lun_ppa(geo, ppa, chid, lunid) \ - for ((chid) = 0, (ppa).ppa = 0; (chid) < (geo)->nr_chnls; \ - (chid)++, (ppa).g.ch = (chid)) \ - for ((lunid) = 0; (lunid) < (geo)->luns_per_chnl; \ - (lunid)++, (ppa).g.lun = (lunid)) +extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From 10995c3dc9d7f47b92ff3e74b4bd191ddb7991ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:10 +0100 Subject: lightnvm: collapse nvm_erase_ppa and nvm_erase_blk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After gennvm and core have been merged, there are no more callers to nvm_erase_ppa. Therefore collapse the device specific and target specific erase functions. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 84309fe27472..f2007b2c4979 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -483,7 +483,6 @@ extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int); extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From 583b7058b2e8071f59646c8fb027f6c3597417ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:11 +0100 Subject: lightnvm: remove nvm_submit_ppa* functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvm_submit_ppa* functions are no longer needed after gennvm and core have been merged. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index f2007b2c4979..abb3d55c107f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -489,10 +489,6 @@ extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *, int); -extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, - void *, int); -extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int, - int, void *, int); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -- cgit v1.2.3 From 8f4fe008fb256649bd0e16c96a6eafa3bd916ac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:12 +0100 Subject: lightnvm: remove nvm_get_bb_tbl and nvm_set_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the merge of gennvm and core, there is no longer a need for the device specific bad block functions. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index abb3d55c107f..cad1e1cb0635 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -473,7 +473,6 @@ extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); -extern int nvm_set_bb_tbl(struct nvm_dev *, struct ppa_addr *, int, int); extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); @@ -490,7 +489,6 @@ extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); -extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); extern int nvm_dev_factory(struct nvm_dev *, int flags); -- cgit v1.2.3 From dab8ee9e8a30620a5b5f22d6c0b3749217093803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:14 +0100 Subject: lightnvm: cleanup nvm transformation functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Going from target specific ppa addresses to device was accomplished by first converting target to generic ppa addresses and generic to device addresses. The conversion was either open-coded or used the built-in nvm_trans_* and nvm_map_* functions for conversion. Simplify the interface and cleanup the calls to provide clean functions that now either take a list of ppas or a nvm_rq, and is exposed through: void nvm_ppa_* - target to/from device with a list of PPAs, void nvm_rq_* - target to/from device with a nvm_rq. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cad1e1cb0635..faa0fbfe339a 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -378,10 +378,10 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, return l; } -static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, - struct ppa_addr r) +static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr r) { - struct nvm_geo *geo = &dev->geo; + struct nvm_geo *geo = &tgt_dev->geo; struct ppa_addr l; l.ppa = ((u64)r.g.blk) << geo->ppaf.blk_offset; @@ -394,10 +394,10 @@ static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, return l; } -static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) +static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr r) { - struct nvm_geo *geo = &dev->geo; + struct nvm_geo *geo = &tgt_dev->geo; struct ppa_addr l; l.ppa = 0; @@ -477,8 +477,6 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); -extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); -extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -- cgit v1.2.3 From 19bd6fe73ca812964963aa30827cff9aae64a715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:15 +0100 Subject: lightnvm: reduce number of nvm_id groups to one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The number of configuration groups has been limited to one in current code, even if there is support for up to four. With the introduction of the open-channel SSD 1.3 specification, only a single group is exposed onwards. Reflect this in the nvm_id structure. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index faa0fbfe339a..ce0b2dac84ac 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -200,11 +200,10 @@ struct nvm_addr_format { struct nvm_id { u8 ver_id; u8 vmnt; - u8 cgrps; u32 cap; u32 dom; struct nvm_addr_format ppaf; - struct nvm_id_group groups[4]; + struct nvm_id_group grp; } __packed; struct nvm_target { -- cgit v1.2.3 From 84d4add793c65b5bda802dcefcf0d7ab1a8e22ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:16 +0100 Subject: lightnvm: add ioctls for vector I/Os MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable user-space to issue vector I/O commands through ioctls. To issue a vector I/O, the ppa list with addresses is also required and must be mapped for the controller to access. For each ioctl, the result and status bits are returned as well, such that user-space can retrieve the open-channel SSD completion bits. The implementation covers the traditional use-cases of bad block management, and vectored read/write/erase. Signed-off-by: Matias Bjørling Metadata implementation, test, and fixes. Signed-off-by: Simon A.F. Lund Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/uapi/linux/lightnvm.h | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 774a43128a7a..fd19f36b3129 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -122,6 +122,44 @@ struct nvm_ioctl_dev_factory { __u32 flags; }; +struct nvm_user_vio { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nppas; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 ppa_list; + __u32 metadata_len; + __u32 data_len; + __u64 status; + __u32 result; + __u32 rsvd3[3]; +}; + +struct nvm_passthru_vio { + __u8 opcode; + __u8 flags; + __u8 rsvd[2]; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u64 ppa_list; + __u16 nppas; + __u16 control; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u64 status; + __u32 result; + __u32 timeout_ms; +}; + /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ enum { /* top level cmds */ @@ -137,6 +175,11 @@ enum { /* Factory reset device */ NVM_DEV_FACTORY_CMD, + + /* Vector user I/O */ + NVM_DEV_VIO_ADMIN_CMD = 0x41, + NVM_DEV_VIO_CMD = 0x42, + NVM_DEV_VIO_USER_CMD = 0x43, }; #define NVM_IOCTL 'L' /* 0x4c */ @@ -154,6 +197,13 @@ enum { #define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ struct nvm_ioctl_dev_factory) +#define NVME_NVM_IOCTL_IO_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_USER_CMD, \ + struct nvm_passthru_vio) +#define NVME_NVM_IOCTL_ADMIN_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_ADMIN_CMD,\ + struct nvm_passthru_vio) +#define NVME_NVM_IOCTL_SUBMIT_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_CMD,\ + struct nvm_user_vio) + #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCHLEVEL 0 -- cgit v1.2.3 From 06894efea706b3cd4ce31e341ec51b4c62c34a86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:17 +0100 Subject: lightnvm: use end_io callback instead of instance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the lightnvm core had the "gennvm" layer between the device and the target, there was a need for the core to be able to figure out which target it should send an end_io callback to. Leading to a "double" end_io, first for the media manager instance, and then for the target instance. Now that core and gennvm is merged, there is no longer a need for this, and a single end_io callback will do. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ce0b2dac84ac..17cd454f0d87 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -213,10 +213,6 @@ struct nvm_target { struct gendisk *disk; }; -struct nvm_tgt_instance { - struct nvm_tgt_type *tt; -}; - #define ADDR_EMPTY (~0ULL) #define NVM_VERSION_MAJOR 1 @@ -227,7 +223,6 @@ struct nvm_rq; typedef void (nvm_end_io_fn)(struct nvm_rq *); struct nvm_rq { - struct nvm_tgt_instance *ins; struct nvm_tgt_dev *dev; struct bio *bio; @@ -251,6 +246,8 @@ struct nvm_rq { u64 ppa_status; /* ppa media status */ int error; + + void *private; }; static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) @@ -450,7 +447,6 @@ struct nvm_tgt_type { /* target entry points */ nvm_tgt_make_rq_fn *make_rq; nvm_tgt_capacity_fn *capacity; - nvm_end_io_fn *end_io; /* module-specific init/teardown */ nvm_tgt_init_fn *init; @@ -484,7 +480,7 @@ extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); -extern void nvm_end_io(struct nvm_rq *, int); +extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -- cgit v1.2.3 From 38ea2f7656f815e7330868cbec7bada0fd7933a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 31 Jan 2017 13:17:18 +0100 Subject: lightnvm: Add CRC read error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let the host differentiate between a read error and a CRC check error on the device side. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 17cd454f0d87..bc282d26017a 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -107,6 +107,7 @@ enum { NVM_RSP_ERR_FAILWRITE = 0x40ff, NVM_RSP_ERR_EMPTYPAGE = 0x42ff, NVM_RSP_ERR_FAILECC = 0x4281, + NVM_RSP_ERR_FAILCRC = 0x4004, NVM_RSP_WARN_HIGHECC = 0x4700, /* Device opcodes */ -- cgit v1.2.3 From 9a69b0ed6257ae5e71c99bf21ce53f98c558476a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 31 Jan 2017 13:17:20 +0100 Subject: lightnvm: allow targets to use sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to register through the sysfs interface, a driver needs to know its kobject. On a disk structure, this happens when the partition information is added (device_add_disk), which for lightnvm takes place after the target has been initialized. This means that on target initialization, the kboject has not been created yet. This patch adds a target function to let targets initialize their own kboject as a child of the disk kobject. Signed-off-by: Javier González Added exit typedef and passed gendisk instead of void pointer for exit. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index bc282d26017a..ca45e4a088a9 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -440,6 +440,8 @@ typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); typedef void (nvm_tgt_exit_fn)(void *); +typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); +typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); struct nvm_tgt_type { const char *name; @@ -453,6 +455,10 @@ struct nvm_tgt_type { nvm_tgt_init_fn *init; nvm_tgt_exit_fn *exit; + /* sysfs */ + nvm_tgt_sysfs_init_fn *sysfs_init; + nvm_tgt_sysfs_exit_fn *sysfs_exit; + /* For internal use */ struct list_head list; }; -- cgit v1.2.3 From 57292b58ddb58689e8c3b4c6eadbef10d9ca44dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Jan 2017 16:57:29 +0100 Subject: block: introduce blk_rq_is_passthrough This can be used to check for fs vs non-fs requests and basically removes all knowledge of BLOCK_PC specific from the block layer, as well as preparing for removing the cmd_type field in struct request. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 +++++++++++----- include/linux/blktrace_api.h | 4 ++-- include/scsi/scsi_cmnd.h | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e4c5f284fe2d..7121be081517 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -242,6 +242,11 @@ struct request { struct request *next_rq; }; +static inline bool blk_rq_is_passthrough(struct request *rq) +{ + return rq->cmd_type != REQ_TYPE_FS; +} + static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; @@ -698,9 +703,10 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) -#define blk_account_rq(rq) \ - (((rq)->rq_flags & RQF_STARTED) && \ - ((rq)->cmd_type == REQ_TYPE_FS)) +static inline bool blk_account_rq(struct request *rq) +{ + return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq); +} #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) @@ -775,7 +781,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync) static inline bool rq_mergeable(struct request *rq) { - if (rq->cmd_type != REQ_TYPE_FS) + if (blk_rq_is_passthrough(rq)) return false; if (req_op(rq) == REQ_OP_FLUSH) @@ -1049,7 +1055,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, { struct request_queue *q = rq->q; - if (unlikely(rq->cmd_type != REQ_TYPE_FS)) + if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; if (!q->limits.chunk_sectors || diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index a143a67a6f33..341f9418bd68 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -114,12 +114,12 @@ extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); static inline sector_t blk_rq_trace_sector(struct request *rq) { - return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_pos(rq); + return blk_rq_is_passthrough(rq) ? 0 : blk_rq_pos(rq); } static inline unsigned int blk_rq_trace_nr_sectors(struct request *rq) { - return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_sectors(rq); + return blk_rq_is_passthrough(rq) ? 0 : blk_rq_sectors(rq); } #endif diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index f708f1acfc50..b379f93a2c48 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -151,7 +151,7 @@ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd) return cmd + 1; } -/* make sure not to use it with REQ_TYPE_BLOCK_PC commands */ +/* make sure not to use it with passthrough commands */ static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) { return *(struct scsi_driver **)cmd->request->rq_disk->private_data; -- cgit v1.2.3 From 2f5a8e80f79dc82e00f4cca557dc9ceaf064b450 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Jan 2017 16:57:30 +0100 Subject: ide: don't abuse cmd_type Currently the legacy ide driver defines several request types of it's own, which is in the way of removing that field entirely. Instead add a type field to struct ide_request and use that to distinguish the different types of IDE-internal requests. It's a bit of a mess, but so is the surrounding code.. Signed-off-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Jens Axboe --- include/linux/ide.h | 56 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/ide.h b/include/linux/ide.h index 086fbe172817..5cc6caa94cac 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -40,24 +40,58 @@ struct device; -/* IDE-specific values for req->cmd_type */ -enum ata_cmd_type_bits { - REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, - REQ_TYPE_ATA_PC, - REQ_TYPE_ATA_SENSE, /* sense request */ - REQ_TYPE_ATA_PM_SUSPEND,/* suspend request */ - REQ_TYPE_ATA_PM_RESUME, /* resume request */ +/* values for ide_request.type */ +enum ata_priv_type { + ATA_PRIV_MISC, + ATA_PRIV_TASKFILE, + ATA_PRIV_PC, + ATA_PRIV_SENSE, /* sense request */ + ATA_PRIV_PM_SUSPEND, /* suspend request */ + ATA_PRIV_PM_RESUME, /* resume request */ }; -#define ata_pm_request(rq) \ - ((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \ - (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME) - struct ide_request { struct scsi_request sreq; u8 sense[SCSI_SENSE_BUFFERSIZE]; + u8 type; }; +static inline struct ide_request *ide_req(struct request *rq) +{ + return blk_mq_rq_to_pdu(rq); +} + +static inline bool ata_misc_request(struct request *rq) +{ + return rq->cmd_type == REQ_TYPE_DRV_PRIV && + ide_req(rq)->type == ATA_PRIV_MISC; +} + +static inline bool ata_taskfile_request(struct request *rq) +{ + return rq->cmd_type == REQ_TYPE_DRV_PRIV && + ide_req(rq)->type == ATA_PRIV_TASKFILE; +} + +static inline bool ata_pc_request(struct request *rq) +{ + return rq->cmd_type == REQ_TYPE_DRV_PRIV && + ide_req(rq)->type == ATA_PRIV_PC; +} + +static inline bool ata_sense_request(struct request *rq) +{ + return rq->cmd_type == REQ_TYPE_DRV_PRIV && + ide_req(rq)->type == ATA_PRIV_SENSE; +} + +static inline bool ata_pm_request(struct request *rq) +{ + return rq->cmd_type == REQ_TYPE_DRV_PRIV && + (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND || + ide_req(rq)->type == ATA_PRIV_PM_RESUME); +} + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, -- cgit v1.2.3 From aebf526b53aea164508730427597d45f3e06b376 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Jan 2017 16:57:31 +0100 Subject: block: fold cmd_type into the REQ_OP_ space Instead of keeping two levels of indirection for requests types, fold it all into the operations. The little caveat here is that previously cmd_type only applied to struct request, while the request and bio op fields were set to plain REQ_OP_READ/WRITE even for passthrough operations. Instead this patch adds new REQ_OP_* for SCSI passthrough and driver private requests, althought it has to add two for each so that we can communicate the data in/out nature of the request. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 7 +++++++ include/linux/blkdev.h | 22 +++++++++++----------- include/linux/ide.h | 14 +++++--------- 3 files changed, 23 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 37c9a43c5e78..d703acb55d0f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,13 @@ enum req_opf { /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = 8, + /* SCSI passthrough using struct scsi_request */ + REQ_OP_SCSI_IN = 32, + REQ_OP_SCSI_OUT = 33, + /* Driver private requests */ + REQ_OP_DRV_IN = 34, + REQ_OP_DRV_OUT = 35, + REQ_OP_LAST, }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7121be081517..1e947e725528 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -70,15 +70,6 @@ struct request_list { unsigned int flags; }; -/* - * request command types - */ -enum rq_cmd_type_bits { - REQ_TYPE_FS = 1, /* fs request */ - REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_DRV_PRIV, /* driver defined types from here */ -}; - /* * request flags */ typedef __u32 __bitwise req_flags_t; @@ -145,7 +136,6 @@ struct request { struct blk_mq_ctx *mq_ctx; int cpu; - unsigned cmd_type; unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; unsigned long atomic_flags; @@ -242,9 +232,19 @@ struct request { struct request *next_rq; }; +static inline bool blk_rq_is_scsi(struct request *rq) +{ + return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT; +} + +static inline bool blk_rq_is_private(struct request *rq) +{ + return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT; +} + static inline bool blk_rq_is_passthrough(struct request *rq) { - return rq->cmd_type != REQ_TYPE_FS; + return blk_rq_is_scsi(rq) || blk_rq_is_private(rq); } static inline unsigned short req_get_ioprio(struct request *req) diff --git a/include/linux/ide.h b/include/linux/ide.h index 5cc6caa94cac..2f51c1724b5a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -63,31 +63,27 @@ static inline struct ide_request *ide_req(struct request *rq) static inline bool ata_misc_request(struct request *rq) { - return rq->cmd_type == REQ_TYPE_DRV_PRIV && - ide_req(rq)->type == ATA_PRIV_MISC; + return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_MISC; } static inline bool ata_taskfile_request(struct request *rq) { - return rq->cmd_type == REQ_TYPE_DRV_PRIV && - ide_req(rq)->type == ATA_PRIV_TASKFILE; + return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_TASKFILE; } static inline bool ata_pc_request(struct request *rq) { - return rq->cmd_type == REQ_TYPE_DRV_PRIV && - ide_req(rq)->type == ATA_PRIV_PC; + return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_PC; } static inline bool ata_sense_request(struct request *rq) { - return rq->cmd_type == REQ_TYPE_DRV_PRIV && - ide_req(rq)->type == ATA_PRIV_SENSE; + return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_SENSE; } static inline bool ata_pm_request(struct request *rq) { - return rq->cmd_type == REQ_TYPE_DRV_PRIV && + return blk_rq_is_private(rq) && (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND || ide_req(rq)->type == ATA_PRIV_PM_RESUME); } -- cgit v1.2.3 From d486f1f204382557b5fbcb3ddbb5845cd4ba4e2c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 31 Jan 2017 12:34:41 -0700 Subject: block: move internal_tag to same cache line as tag Since we removed cmd_type, we now have a hole in the struct. Move the internal_tag member to the same cacheline as tag, since we use them at the same time. This doesn't fix the hole, just moves it elsewhere. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1e947e725528..11f7a8e86a89 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -138,6 +138,9 @@ struct request { int cpu; unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; + + int internal_tag; + unsigned long atomic_flags; /* the following two fields are internal, NEVER access directly */ @@ -209,8 +212,6 @@ struct request { unsigned short ioprio; - int internal_tag; - void *special; /* opaque pointer available for LLD use */ int errors; -- cgit v1.2.3 From f44f1ab5a2dcd4e16eab850fd08e40ff2d0c28d4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:49 +0100 Subject: block: Unhash block device inodes on gendisk destruction Currently, block device inodes stay around after corresponding gendisk hash died until memory reclaim finds them and frees them. Since we will make block device inode pin the bdi, we want to free the block device inode as soon as the device goes away so that bdi does not stay around unnecessarily. Furthermore we need to avoid issues when new device with the same major,minor pair gets created since reusing the bdi structure would be rather difficult in this case. Unhashing block device inode on gendisk destruction nicely deals with these problems. Once last block device inode reference is dropped (which may be directly in del_gendisk()), the inode gets evicted. Furthermore if the major,minor pair gets reallocated, we are guaranteed to get new block device inode even if old block device inode is not yet evicted and thus we avoid issues with possible reuse of bdi. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2ba074328894..702cb6c50194 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2342,6 +2342,7 @@ extern struct kmem_cache *names_cachep; #ifdef CONFIG_BLOCK extern int register_blkdev(unsigned int, const char *); extern void unregister_blkdev(unsigned int, const char *); +extern void bdev_unhash_inode(dev_t dev); extern struct block_device *bdget(dev_t); extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); -- cgit v1.2.3 From dc3b17cc8bf21307c7e076e7c778d5db756f7871 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:50 +0100 Subject: block: Use pointer to backing_dev_info from request_queue We will want to have struct backing_dev_info allocated separately from struct request_queue. As the first step add pointer to backing_dev_info to request_queue and convert all users touching it. No functional changes in this patch. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 11f7a8e86a89..a75e42de34ab 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -432,7 +432,8 @@ struct request_queue { */ struct delayed_work delay_work; - struct backing_dev_info backing_dev_info; + struct backing_dev_info *backing_dev_info; + struct backing_dev_info _backing_dev_info; /* * The queue owner gets to use this for whatever they like. -- cgit v1.2.3 From d03f6cdc1fc422accb734c7c07a661a0018d8631 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:51 +0100 Subject: block: Dynamically allocate and refcount backing_dev_info Instead of storing backing_dev_info inside struct request_queue, allocate it dynamically, reference count it, and free it when the last reference is dropped. Currently only request_queue holds the reference but in the following patch we add other users referencing backing_dev_info. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 2 ++ include/linux/backing-dev.h | 10 +++++++++- include/linux/blkdev.h | 1 - 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e850e76acaaf..ad955817916d 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -10,6 +10,7 @@ #include #include #include +#include struct page; struct device; @@ -144,6 +145,7 @@ struct backing_dev_info { char *name; + struct kref refcnt; /* Reference counter for the structure */ unsigned int capabilities; /* Device capabilities */ unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 43b93a947e61..efb6ca992d05 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -18,7 +18,14 @@ #include int __must_check bdi_init(struct backing_dev_info *bdi); -void bdi_exit(struct backing_dev_info *bdi); + +static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) +{ + kref_get(&bdi->refcnt); + return bdi; +} + +void bdi_put(struct backing_dev_info *bdi); __printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -29,6 +36,7 @@ void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_destroy(struct backing_dev_info *bdi); +struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a75e42de34ab..e77c1039fd0e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -433,7 +433,6 @@ struct request_queue { struct delayed_work delay_work; struct backing_dev_info *backing_dev_info; - struct backing_dev_info _backing_dev_info; /* * The queue owner gets to use this for whatever they like. -- cgit v1.2.3 From b1d2dc5659b41741f5a29b2ade76ffb4e5bb13d8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:52 +0100 Subject: block: Make blk_get_backing_dev_info() safe without open bdev Currenly blk_get_backing_dev_info() is not safe to be called when the block device is not open as bdev->bd_disk is NULL in that case. However inode_to_bdi() uses this function and may be call called from flusher worker or other writeback related functions without bdev being open which leads to crashes such as: [113031.075540] Unable to handle kernel paging request for data at address 0x00000000 [113031.075614] Faulting instruction address: 0xc0000000003692e0 0:mon> t [c0000000fb65f900] c00000000036cb6c writeback_sb_inodes+0x30c/0x590 [c0000000fb65fa10] c00000000036ced4 __writeback_inodes_wb+0xe4/0x150 [c0000000fb65fa70] c00000000036d33c wb_writeback+0x30c/0x450 [c0000000fb65fb40] c00000000036e198 wb_workfn+0x268/0x580 [c0000000fb65fc50] c0000000000f3470 process_one_work+0x1e0/0x590 [c0000000fb65fce0] c0000000000f38c8 worker_thread+0xa8/0x660 [c0000000fb65fd80] c0000000000fc4b0 kthread+0x110/0x130 [c0000000fb65fe30] c0000000000098f0 ret_from_kernel_thread+0x5c/0x6c Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 702cb6c50194..c930cbc19342 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -423,6 +423,7 @@ struct block_device { int bd_invalidated; struct gendisk * bd_disk; struct request_queue * bd_queue; + struct backing_dev_info *bd_bdi; struct list_head bd_list; /* * Private data. You must have bd_claim'ed the block_device -- cgit v1.2.3 From efa7c9f97e3ef624e9a398bf69c15f58eea9f0e8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:53 +0100 Subject: block: Get rid of blk_get_backing_dev_info() blk_get_backing_dev_info() is now a simple dereference. Remove that function and simplify some code around that. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- include/linux/blkdev.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index efb6ca992d05..c52a48cb9a66 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -191,7 +191,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) - return blk_get_backing_dev_info(I_BDEV(inode)); + return I_BDEV(inode)->bd_bdi; #endif return sb->s_bdi; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e77c1039fd0e..b31137e2afd0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1187,7 +1187,6 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); -extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) { -- cgit v1.2.3 From 0dba1314d4f81115dce711292ec7981d17231064 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 1 Feb 2017 14:05:23 -0800 Subject: scsi, block: fix duplicate bdi name registration crashes Warnings of the following form occur because scsi reuses a devt number while the block layer still has it referenced as the name of the bdi [1]: WARNING: CPU: 1 PID: 93 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x62/0x80 sysfs: cannot create duplicate filename '/devices/virtual/bdi/8:192' [..] Call Trace: dump_stack+0x86/0xc3 __warn+0xcb/0xf0 warn_slowpath_fmt+0x5f/0x80 ? kernfs_path_from_node+0x4f/0x60 sysfs_warn_dup+0x62/0x80 sysfs_create_dir_ns+0x77/0x90 kobject_add_internal+0xb2/0x350 kobject_add+0x75/0xd0 device_add+0x15a/0x650 device_create_groups_vargs+0xe0/0xf0 device_create_vargs+0x1c/0x20 bdi_register+0x90/0x240 ? lockdep_init_map+0x57/0x200 bdi_register_owner+0x36/0x60 device_add_disk+0x1bb/0x4e0 ? __pm_runtime_use_autosuspend+0x5c/0x70 sd_probe_async+0x10d/0x1c0 async_run_entry_fn+0x39/0x170 This is a brute-force fix to pass the devt release information from sd_probe() to the locations where we register the bdi, device_add_disk(), and unregister the bdi, blk_cleanup_queue(). Thanks to Omar for the quick reproducer script [2]. This patch survives where an unmodified kernel fails in a few seconds. [1]: https://marc.info/?l=linux-scsi&m=147116857810716&w=4 [2]: http://marc.info/?l=linux-block&m=148554717109098&w=2 Cc: James Bottomley Cc: Bart Van Assche Cc: "Martin K. Petersen" Cc: Jan Kara Reported-by: Omar Sandoval Tested-by: Omar Sandoval Signed-off-by: Dan Williams Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + include/linux/genhd.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b31137e2afd0..f84fbe55d3b3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -433,6 +433,7 @@ struct request_queue { struct delayed_work delay_work; struct backing_dev_info *backing_dev_info; + struct disk_devt *disk_devt; /* * The queue owner gets to use this for whatever they like. diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 76f39754e7b0..a999d281a2f1 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -167,6 +167,13 @@ struct blk_integrity { }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ +struct disk_devt { + atomic_t count; + void (*release)(struct disk_devt *disk_devt); +}; + +void put_disk_devt(struct disk_devt *disk_devt); +void get_disk_devt(struct disk_devt *disk_devt); struct gendisk { /* major, first_minor and minors are input parameters only, @@ -176,6 +183,7 @@ struct gendisk { int first_minor; int minors; /* maximum number of minors, =1 for * disks that can't be partitioned. */ + struct disk_devt *disk_devt; char disk_name[DISK_NAME_LEN]; /* name of major driver */ char *(*devnode)(struct gendisk *gd, umode_t *mode); -- cgit v1.2.3 From a7c5437b0bbec5165df6eb1efc5e37dc40b9e05d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 31 Jan 2017 14:53:17 -0800 Subject: debugfs: add debugfs_lookup() We don't always have easy access to the dentry of a file or directory we created in debugfs. Add a helper which allows us to get a dentry we previously created. The motivation for this change is a problem with blktrace and the blk-mq debugfs entries introduced in 07e4fead45e6 ("blk-mq: create debugfs directory tree"). Namely, in some cases, the directory that blktrace needs to create may already exist, but in other cases, it may not. We _could_ rely on a bunch of implied knowledge to decide whether to create the directory or not, but it's much cleaner on our end to just look it up. Signed-off-by: Omar Sandoval Acked-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- include/linux/debugfs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 014cc564d1c4..c0befcf41b58 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -80,6 +80,8 @@ static const struct file_operations __fops = { \ #if defined(CONFIG_DEBUG_FS) +struct dentry *debugfs_lookup(const char *name, struct dentry *parent); + struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); @@ -181,6 +183,12 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, * want to duplicate the design decision mistakes of procfs and devfs again. */ +static inline struct dentry *debugfs_lookup(const char *name, + struct dentry *parent) +{ + return ERR_PTR(-ENODEV); +} + static inline struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) -- cgit v1.2.3 From 03796c149a99e14506db9de3adba710c26f83ba9 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 31 Jan 2017 14:53:18 -0800 Subject: block: fix debugfs config conditional in struct request_queue The debugfs dentries are only used for CONFIG_BLK_DEBUG_FS, so make them conditional on that instead of CONFIG_DEBUG_FS. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f84fbe55d3b3..e0bac14347e6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -570,7 +570,7 @@ struct request_queue { struct list_head tag_set_list; struct bio_set *bio_split; -#ifdef CONFIG_DEBUG_FS +#ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; struct dentry *mq_debugfs_dir; #endif -- cgit v1.2.3 From a428d314ebcf65842fd64ad850c02c280586e74d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 31 Jan 2017 14:53:19 -0800 Subject: blktrace: make do_blk_trace_setup() static This isn't used outside of blktrace.c anymore. Fixes: 62c2a7d969f3 ("block: push BKL into blktrace ioctls") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 341f9418bd68..d2e908586e3d 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -30,9 +30,6 @@ struct blk_trace { extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern int do_blk_trace_setup(struct request_queue *q, char *name, - dev_t dev, struct block_device *bdev, - struct blk_user_trace_setup *buts); extern __printf(2, 3) void __trace_note_message(struct blk_trace *, const char *fmt, ...); @@ -80,7 +77,6 @@ extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ # define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) # define blk_trace_shutdown(q) do { } while (0) -# define do_blk_trace_setup(q, name, dev, bdev, buts) (-ENOTTY) # define blk_add_driver_data(q, rq, data, len) do {} while (0) # define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) # define blk_trace_startstop(q, start) (-ENOTTY) -- cgit v1.2.3 From 19641f2d7674fbf2891e9579f61c1b23821086e8 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 3 Feb 2017 12:50:30 -0700 Subject: Include: Uapi: Add user ABI for Sed/Opal Signed-off-by: Scott Bauer Signed-off-by: Rafael Antognolli Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 118 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 include/uapi/linux/sed-opal.h (limited to 'include') diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h new file mode 100644 index 000000000000..31799526082a --- /dev/null +++ b/include/uapi/linux/sed-opal.h @@ -0,0 +1,118 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli + * Scott Bauer + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _UAPI_SED_OPAL_H +#define _UAPI_SED_OPAL_H + +#include + +#define OPAL_KEY_MAX 256 +#define OPAL_MAX_LRS 9 + +enum opal_mbr { + OPAL_MBR_ENABLE = 0x0, + OPAL_MBR_DISABLE = 0x01, +}; + +enum opal_user { + OPAL_ADMIN1 = 0x0, + OPAL_USER1 = 0x01, + OPAL_USER2 = 0x02, + OPAL_USER3 = 0x03, + OPAL_USER4 = 0x04, + OPAL_USER5 = 0x05, + OPAL_USER6 = 0x06, + OPAL_USER7 = 0x07, + OPAL_USER8 = 0x08, + OPAL_USER9 = 0x09, +}; + +enum opal_lock_state { + OPAL_RO = 0x01, /* 0001 */ + OPAL_RW = 0x02, /* 0010 */ + OPAL_LK = 0x04, /* 0100 */ +}; + +struct opal_key { + uint8_t lr; + uint8_t key_len; + char key[OPAL_KEY_MAX]; +}; + +struct opal_lr_act { + int sum; + uint8_t num_lrs; + uint8_t lr[OPAL_MAX_LRS]; + struct opal_key key; +}; + +struct opal_session_info { + int sum; + enum opal_user who; + struct opal_key opal_key; + uint8_t __align[2]; +}; + +struct opal_user_lr_setup { + size_t range_start; + size_t range_length; + int RLE; /* Read Lock enabled */ + int WLE; /* Write Lock Enabled */ + struct opal_session_info session; + uint8_t __align[4]; +}; + +struct opal_lock_unlock { + enum opal_lock_state l_state; + struct opal_session_info session; +}; + +struct opal_new_pw { + struct opal_session_info session; + + /* When we're not operating in sum, and we first set + * passwords we need to set them via ADMIN authority. + * After passwords are changed, we can set them via, + * User authorities. + * Because of this restriction we need to know about + * Two different users. One in 'session' which we will use + * to start the session and new_userr_pw as the user we're + * chaning the pw for. + */ + struct opal_session_info new_user_pw; +}; + +struct opal_mbr_data { + u8 enable_disable; + struct opal_key key; + uint8_t __align[5]; +}; + +#define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) +#define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) +#define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) +#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_key) +#define IOC_OPAL_SET_PW _IOW('p', 224, struct opal_new_pw) +#define IOC_OPAL_ACTIVATE_USR _IOW('p', 225, struct opal_session_info) +#define IOC_OPAL_REVERT_TPR _IOW('p', 226, struct opal_key) +#define IOC_OPAL_LR_SETUP _IOW('p', 227, struct opal_user_lr_setup) +#define IOC_OPAL_ADD_USR_TO_LR _IOW('p', 228, struct opal_lock_unlock) +#define IOC_OPAL_ENABLE_DISABLE_MBR _IOW('p', 229, struct opal_mbr_data) +#define IOC_OPAL_ERASE_LR _IOW('p', 230, struct opal_session_info) +#define IOC_OPAL_SECURE_ERASE_LR _IOW('p', 231, struct opal_session_info) + +#endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 455a7b238cd6bc68c4a550cbbd37c1e22b64f71c Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 3 Feb 2017 12:50:31 -0700 Subject: block: Add Sed-opal library This patch implements the necessary logic to bring an Opal enabled drive out of a factory-enabled into a working Opal state. This patch set also enables logic to save a password to be replayed during a resume from suspend. Signed-off-by: Scott Bauer Signed-off-by: Rafael Antognolli Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 178 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 include/linux/sed-opal.h (limited to 'include') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h new file mode 100644 index 000000000000..af1a85eae193 --- /dev/null +++ b/include/linux/sed-opal.h @@ -0,0 +1,178 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli + * Scott Bauer + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef LINUX_OPAL_H +#define LINUX_OPAL_H + +#include +#include + +/* + * These constant values come from: + * SPC-4 section + * 6.30 SECURITY PROTOCOL IN command / table 265. + */ +enum { + TCG_SECP_00 = 0, + TCG_SECP_01, +}; +struct opal_dev; + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 + +typedef int (*opal_step)(struct opal_dev *dev); +typedef int (sec_send_recv)(struct opal_dev *ctx, u16 spsp, u8 secp, + void *buffer, size_t len, bool send); + + +enum opal_atom_width { + OPAL_WIDTH_TINY, + OPAL_WIDTH_SHORT, + OPAL_WIDTH_MEDIUM, + OPAL_WIDTH_LONG, + OPAL_WIDTH_TOKEN +}; + +/* + * Token defs derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * 3.2.2 Data Stream Encoding + */ +enum opal_response_token { + OPAL_DTA_TOKENID_BYTESTRING = 0xe0, + OPAL_DTA_TOKENID_SINT = 0xe1, + OPAL_DTA_TOKENID_UINT = 0xe2, + OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0 +}; + +/* + * On the parsed response, we don't store again the toks that are already + * stored in the response buffer. Instead, for each token, we just store a + * pointer to the position in the buffer where the token starts, and the size + * of the token in bytes. + */ +struct opal_resp_tok { + const u8 *pos; + size_t len; + enum opal_response_token type; + enum opal_atom_width width; + union { + u64 u; + s64 s; + } stored; +}; + +/* + * From the response header it's not possible to know how many tokens there are + * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later + * if we start dealing with messages that have more than that, we can increase + * this number. This is done to avoid having to make two passes through the + * response, the first one counting how many tokens we have and the second one + * actually storing the positions. + */ +struct parsed_resp { + int num; + struct opal_resp_tok toks[MAX_TOKS]; +}; + +/** + * struct opal_dev - The structure representing a OPAL enabled SED. + * @supported: Whether or not OPAL is supported on this controller. + * @send_recv: The combined sec_send/sec_recv function pointer. + * @opal_step: A series of opal methods that are necessary to complete a command. + * @func_data: An array of parameters for the opal methods above. + * @state: Describes the current opal_step we're working on. + * @dev_lock: Locks the entire opal_dev structure. + * @parsed: Parsed response from controller. + * @prev_data: Data returned from a method to the controller. + * @unlk_lst: A list of Locking ranges to unlock on this device during a resume. + */ +struct opal_dev { + bool initialized; + bool supported; + sec_send_recv *send_recv; + + const opal_step *funcs; + void **func_data; + int state; + struct mutex dev_lock; + u16 comid; + u32 hsn; + u32 tsn; + u64 align; + u64 lowest_lba; + + size_t pos; + u8 cmd[IO_BUFFER_LENGTH]; + u8 resp[IO_BUFFER_LENGTH]; + + struct parsed_resp parsed; + size_t prev_d_len; + void *prev_data; + + struct list_head unlk_lst; +}; + +#ifdef CONFIG_BLK_SED_OPAL +bool opal_unlock_from_suspend(struct opal_dev *dev); +void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv); +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr); + +static inline bool is_sed_ioctl(unsigned int cmd) +{ + switch (cmd) { + case IOC_OPAL_SAVE: + case IOC_OPAL_LOCK_UNLOCK: + case IOC_OPAL_TAKE_OWNERSHIP: + case IOC_OPAL_ACTIVATE_LSP: + case IOC_OPAL_SET_PW: + case IOC_OPAL_ACTIVATE_USR: + case IOC_OPAL_REVERT_TPR: + case IOC_OPAL_LR_SETUP: + case IOC_OPAL_ADD_USR_TO_LR: + case IOC_OPAL_ENABLE_DISABLE_MBR: + case IOC_OPAL_ERASE_LR: + case IOC_OPAL_SECURE_ERASE_LR: + return true; + } + return false; +} +#else +static inline bool is_sed_ioctl(unsigned int cmd) +{ + return false; +} + +static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd, + unsigned long ptr) +{ + return 0; +} +static inline bool opal_unlock_from_suspend(struct opal_dev *dev) +{ + return false; +} +static inline void init_opal_dev(struct opal_dev *opal_dev, + sec_send_recv *send_recv) +{ + opal_dev->supported = false; + opal_dev->initialized = true; +} +#endif /* CONFIG_BLK_SED_OPAL */ +#endif /* LINUX_OPAL_H */ -- cgit v1.2.3 From 8c87fe722053658467bcc9e5ea82051ce3d3a693 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Mon, 6 Feb 2017 17:22:49 -0700 Subject: Fix SED-OPAL UAPI structs to prevent 32/64 bit size differences. This patch is a quick fixup of the user structures that will prevent the structures from being different sizes on 32 and 64 bit archs. Taking this fix will allow us to *NOT* have to do compat ioctls for the sed code. Signed-off-by: Scott Bauer Fixes: 19641f2d7674 ("Include: Uapi: Add user ABI for Sed/Opal") Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 31799526082a..fc06e3a20a51 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -48,37 +48,38 @@ enum opal_lock_state { }; struct opal_key { - uint8_t lr; - uint8_t key_len; - char key[OPAL_KEY_MAX]; + __u8 lr; + __u8 key_len; + __u8 __align[6]; + __u8 key[OPAL_KEY_MAX]; }; struct opal_lr_act { - int sum; - uint8_t num_lrs; - uint8_t lr[OPAL_MAX_LRS]; struct opal_key key; + __u32 sum; + __u8 num_lrs; + __u8 lr[OPAL_MAX_LRS]; + __u8 align[2]; /* Align to 8 byte boundary */ }; struct opal_session_info { - int sum; - enum opal_user who; + __u32 sum; + __u32 who; struct opal_key opal_key; - uint8_t __align[2]; }; struct opal_user_lr_setup { - size_t range_start; - size_t range_length; - int RLE; /* Read Lock enabled */ - int WLE; /* Write Lock Enabled */ + __u64 range_start; + __u64 range_length; + __u32 RLE; /* Read Lock enabled */ + __u32 WLE; /* Write Lock Enabled */ struct opal_session_info session; - uint8_t __align[4]; }; struct opal_lock_unlock { - enum opal_lock_state l_state; struct opal_session_info session; + __u32 l_state; + __u8 __align[4]; }; struct opal_new_pw { @@ -97,9 +98,9 @@ struct opal_new_pw { }; struct opal_mbr_data { - u8 enable_disable; struct opal_key key; - uint8_t __align[5]; + __u8 enable_disable; + __u8 __align[7]; }; #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) -- cgit v1.2.3 From 34fe7c05400663e01e23cddd1fea68bb7a2b3d29 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Feb 2017 14:46:48 +0100 Subject: block: enumify ELEVATOR_*_MERGE Switch these constants to an enum, and make let the compiler ensure that all callers of blk_try_merge and elv_merge handle all potential values. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/elevator.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b5825c4f06f7..b38b4e651ea6 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -9,12 +9,21 @@ struct io_cq; struct elevator_type; -typedef int (elevator_merge_fn) (struct request_queue *, struct request **, +/* + * Return values from elevator merger + */ +enum elv_merge { + ELEVATOR_NO_MERGE = 0, + ELEVATOR_FRONT_MERGE = 1, + ELEVATOR_BACK_MERGE = 2, +}; + +typedef enum elv_merge (elevator_merge_fn) (struct request_queue *, struct request **, struct bio *); typedef void (elevator_merge_req_fn) (struct request_queue *, struct request *, struct request *); -typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int); +typedef void (elevator_merged_fn) (struct request_queue *, struct request *, enum elv_merge); typedef int (elevator_allow_bio_merge_fn) (struct request_queue *, struct request *, struct bio *); @@ -87,7 +96,7 @@ struct elevator_mq_ops { bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); int (*request_merge)(struct request_queue *q, struct request **, struct bio *); - void (*request_merged)(struct request_queue *, struct request *, int); + void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); void (*put_request)(struct request *); @@ -166,10 +175,12 @@ extern void elv_dispatch_sort(struct request_queue *, struct request *); extern void elv_dispatch_add_tail(struct request_queue *, struct request *); extern void elv_add_request(struct request_queue *, struct request *, int); extern void __elv_add_request(struct request_queue *, struct request *, int); -extern int elv_merge(struct request_queue *, struct request **, struct bio *); +extern enum elv_merge elv_merge(struct request_queue *, struct request **, + struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); -extern void elv_merged_request(struct request_queue *, struct request *, int); +extern void elv_merged_request(struct request_queue *, struct request *, + enum elv_merge); extern void elv_bio_merged(struct request_queue *q, struct request *, struct bio *); extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); @@ -218,13 +229,6 @@ extern void elv_rb_add(struct rb_root *, struct request *); extern void elv_rb_del(struct rb_root *, struct request *); extern struct request *elv_rb_find(struct rb_root *, sector_t); -/* - * Return values from elevator merger - */ -#define ELEVATOR_NO_MERGE 0 -#define ELEVATOR_FRONT_MERGE 1 -#define ELEVATOR_BACK_MERGE 2 - /* * Insertion selection */ -- cgit v1.2.3 From 1e739730c5b9ea80a2f25e9cf6e1025d47e3d8ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Feb 2017 14:46:49 +0100 Subject: block: optionally merge discontiguous discard bios into a single request Add a new merge strategy that merges discard bios into a request until the maximum number of discard ranges (or the maximum discard size) is reached from the plug merging code. I/O scheduler merging is not wired up yet but might also be useful, although not for fast devices like NVMe which are the only user for now. Note that for now we don't support limiting the size of each discard range, but if needed that can be added later. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 26 ++++++++++++++++++++++++++ include/linux/elevator.h | 1 + 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e0bac14347e6..aecca0e7d9ca 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -331,6 +331,7 @@ struct queue_limits { unsigned short logical_block_size; unsigned short max_segments; unsigned short max_integrity_segments; + unsigned short max_discard_segments; unsigned char misaligned; unsigned char discard_misaligned; @@ -1146,6 +1147,8 @@ extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); +extern void blk_queue_max_discard_segments(struct request_queue *, + unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); @@ -1189,6 +1192,15 @@ extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); +/* + * Number of physical segments as sent to the device. + * + * Normally this is the number of discontiguous data segments sent by the + * submitter. But for data-less command like discard we might have no + * actual data segments submitted, but the driver might have to add it's + * own special payload. In that case we still return 1 here so that this + * special payload will be mapped. + */ static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) @@ -1196,6 +1208,15 @@ static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) return rq->nr_phys_segments; } +/* + * Number of discard segments (or ranges) the driver needs to fill in. + * Each discard bio merged into a request is counted as one segment. + */ +static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) +{ + return max_t(unsigned short, rq->nr_phys_segments, 1); +} + extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); @@ -1384,6 +1405,11 @@ static inline unsigned short queue_max_segments(struct request_queue *q) return q->limits.max_segments; } +static inline unsigned short queue_max_discard_segments(struct request_queue *q) +{ + return q->limits.max_discard_segments; +} + static inline unsigned int queue_max_segment_size(struct request_queue *q) { return q->limits.max_segment_size; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b38b4e651ea6..8265b6330cc2 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -16,6 +16,7 @@ enum elv_merge { ELEVATOR_NO_MERGE = 0, ELEVATOR_FRONT_MERGE = 1, ELEVATOR_BACK_MERGE = 2, + ELEVATOR_DISCARD_MERGE = 3, }; typedef enum elv_merge (elevator_merge_fn) (struct request_queue *, struct request **, -- cgit v1.2.3 From b35ba01ea6979125e9c23fb322517748278f15e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Feb 2017 14:46:50 +0100 Subject: nvme: support ranged discard requests NVMe supports up to 256 ranges per DSM command, so wire up support for ranged discards up to that limit. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3d1c6f1b15c9..3e2ed49c3ad8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -553,6 +553,8 @@ enum { NVME_DSMGMT_AD = 1 << 2, }; +#define NVME_DSM_MAX_RANGES 256 + struct nvme_dsm_range { __le32 cattr; __le32 nlb; -- cgit v1.2.3 From f1ba82616c3368e1ae9e64ef29cf3edc1be0860d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 7 Feb 2017 18:24:43 +0100 Subject: blk-mq: pass bio to blk_mq_sched_get_rq_priv bio is used in bfq-mq's get_rq_priv, to get the request group. We could pass directly the group here, but I thought that passing the bio was more general, giving the possibility to get other pieces of information if needed. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 8265b6330cc2..aebecc4ed088 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -109,7 +109,7 @@ struct elevator_mq_ops { void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); - int (*get_rq_priv)(struct request_queue *, struct request *); + int (*get_rq_priv)(struct request_queue *, struct request *, struct bio *); void (*put_rq_priv)(struct request_queue *, struct request *); void (*init_icq)(struct io_cq *); void (*exit_icq)(struct io_cq *); -- cgit v1.2.3 From 853fe1bf7554155376bb3b231112cdff9ff79177 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 13 Feb 2017 16:25:26 -0800 Subject: cdrom: Make device operations read-only Since function tables are a common target for attackers, it's best to keep them in read-only memory. As such, this makes the CDROM device ops tables const. This drops additionally n_minors, since it isn't used meaningfully, and sets the only user of cdrom_dummy_generic_packet explicitly so the variables can all be const. Inspired by similar changes in grsecurity/PaX. Signed-off-by: Kees Cook Acked-by: David S. Miller Signed-off-by: Jens Axboe --- include/linux/cdrom.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 8609d577bb66..6e8f209a6dff 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -36,7 +36,7 @@ struct packet_command /* Uniform cdrom data structures for cdrom.c */ struct cdrom_device_info { - struct cdrom_device_ops *ops; /* link to device_ops */ + const struct cdrom_device_ops *ops; /* link to device_ops */ struct list_head list; /* linked list of all device_info */ struct gendisk *disk; /* matching block layer disk */ void *handle; /* driver-dependent data */ @@ -87,7 +87,6 @@ struct cdrom_device_ops { /* driver specifications */ const int capability; /* capability flags */ - int n_minors; /* number of active minor devices */ /* handle uniform packets for scsi type devices (scsi,atapi) */ int (*generic_packet) (struct cdrom_device_info *, struct packet_command *); @@ -123,6 +122,8 @@ extern int cdrom_mode_sense(struct cdrom_device_info *cdi, int page_code, int page_control); extern void init_cdrom_command(struct packet_command *cgc, void *buffer, int len, int type); +extern int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, + struct packet_command *cgc); /* The SCSI spec says there could be 256 slots. */ #define CDROM_MAX_SLOTS 256 -- cgit v1.2.3 From 24bff4d78a572d25fe2a0818f55bebda8a2d4709 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 14 Feb 2017 17:29:35 -0700 Subject: uapi: sed-opal fix IOW for activate lsp to use correct struct The IOC_OPAL_ACTIVATE_LSP took the wrong strcure which would give us the wrong size when using _IOC_SIZE, switch it to the right structure. Fixes: 058f8a2 ("Include: Uapi: Add user ABI for Sed/Opal") Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index fc06e3a20a51..c72e0735532d 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -106,7 +106,7 @@ struct opal_mbr_data { #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) -#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_key) +#define IOC_OPAL_ACTIVATE_LSP _IOW('p', 223, struct opal_lr_act) #define IOC_OPAL_SET_PW _IOW('p', 224, struct opal_new_pw) #define IOC_OPAL_ACTIVATE_USR _IOW('p', 225, struct opal_session_info) #define IOC_OPAL_REVERT_TPR _IOW('p', 226, struct opal_key) -- cgit v1.2.3 From e225c20eb0fd0b6657e640408f11ee392dc82b5b Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 14 Feb 2017 17:29:36 -0700 Subject: Move stack parameters for sed_ioctl to prevent oversized stack with CONFIG_KASAN When CONFIG_KASAN is enabled, compilation fails: block/sed-opal.c: In function 'sed_ioctl': block/sed-opal.c:2447:1: error: the frame size of 2256 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] Moved all the ioctl structures off the stack and dynamically allocate using _IOC_SIZE() Fixes: 455a7b238cd6 ("block: Add Sed-opal library") Reported-by: Arnd Bergmann Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index af1a85eae193..205d520ea688 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -132,7 +132,7 @@ struct opal_dev { #ifdef CONFIG_BLK_SED_OPAL bool opal_unlock_from_suspend(struct opal_dev *dev); void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv); -int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr); +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); static inline bool is_sed_ioctl(unsigned int cmd) { @@ -160,7 +160,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) } static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd, - unsigned long ptr) + void __user *ioctl_ptr) { return 0; } -- cgit v1.2.3 From 4f1244c8298606b8fae64b4d78b820ae6b896e3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Feb 2017 13:59:39 +0100 Subject: block/sed-opal: allocate struct opal_dev dynamically Insted of bloating the containing structure with it all the time this allocates struct opal_dev dynamically. Additionally this allows moving the definition of struct opal_dev into sed-opal.c. For this a new private data field is added to it that is passed to the send/receive callback. After that a lot of internals can be made private as well. Signed-off-by: Christoph Hellwig Tested-by: Scott Bauer Reviewed-by: Scott Bauer Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 116 ++--------------------------------------------- 1 file changed, 4 insertions(+), 112 deletions(-) (limited to 'include') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 205d520ea688..deee23d012e7 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -21,117 +21,14 @@ #include #include -/* - * These constant values come from: - * SPC-4 section - * 6.30 SECURITY PROTOCOL IN command / table 265. - */ -enum { - TCG_SECP_00 = 0, - TCG_SECP_01, -}; struct opal_dev; -#define IO_BUFFER_LENGTH 2048 -#define MAX_TOKS 64 - -typedef int (*opal_step)(struct opal_dev *dev); -typedef int (sec_send_recv)(struct opal_dev *ctx, u16 spsp, u8 secp, - void *buffer, size_t len, bool send); - - -enum opal_atom_width { - OPAL_WIDTH_TINY, - OPAL_WIDTH_SHORT, - OPAL_WIDTH_MEDIUM, - OPAL_WIDTH_LONG, - OPAL_WIDTH_TOKEN -}; - -/* - * Token defs derived from: - * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 - * 3.2.2 Data Stream Encoding - */ -enum opal_response_token { - OPAL_DTA_TOKENID_BYTESTRING = 0xe0, - OPAL_DTA_TOKENID_SINT = 0xe1, - OPAL_DTA_TOKENID_UINT = 0xe2, - OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ - OPAL_DTA_TOKENID_INVALID = 0X0 -}; - -/* - * On the parsed response, we don't store again the toks that are already - * stored in the response buffer. Instead, for each token, we just store a - * pointer to the position in the buffer where the token starts, and the size - * of the token in bytes. - */ -struct opal_resp_tok { - const u8 *pos; - size_t len; - enum opal_response_token type; - enum opal_atom_width width; - union { - u64 u; - s64 s; - } stored; -}; - -/* - * From the response header it's not possible to know how many tokens there are - * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later - * if we start dealing with messages that have more than that, we can increase - * this number. This is done to avoid having to make two passes through the - * response, the first one counting how many tokens we have and the second one - * actually storing the positions. - */ -struct parsed_resp { - int num; - struct opal_resp_tok toks[MAX_TOKS]; -}; - -/** - * struct opal_dev - The structure representing a OPAL enabled SED. - * @supported: Whether or not OPAL is supported on this controller. - * @send_recv: The combined sec_send/sec_recv function pointer. - * @opal_step: A series of opal methods that are necessary to complete a command. - * @func_data: An array of parameters for the opal methods above. - * @state: Describes the current opal_step we're working on. - * @dev_lock: Locks the entire opal_dev structure. - * @parsed: Parsed response from controller. - * @prev_data: Data returned from a method to the controller. - * @unlk_lst: A list of Locking ranges to unlock on this device during a resume. - */ -struct opal_dev { - bool initialized; - bool supported; - sec_send_recv *send_recv; - - const opal_step *funcs; - void **func_data; - int state; - struct mutex dev_lock; - u16 comid; - u32 hsn; - u32 tsn; - u64 align; - u64 lowest_lba; - - size_t pos; - u8 cmd[IO_BUFFER_LENGTH]; - u8 resp[IO_BUFFER_LENGTH]; - - struct parsed_resp parsed; - size_t prev_d_len; - void *prev_data; - - struct list_head unlk_lst; -}; +typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer, + size_t len, bool send); #ifdef CONFIG_BLK_SED_OPAL bool opal_unlock_from_suspend(struct opal_dev *dev); -void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv); +struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); static inline bool is_sed_ioctl(unsigned int cmd) @@ -168,11 +65,6 @@ static inline bool opal_unlock_from_suspend(struct opal_dev *dev) { return false; } -static inline void init_opal_dev(struct opal_dev *opal_dev, - sec_send_recv *send_recv) -{ - opal_dev->supported = false; - opal_dev->initialized = true; -} +#define init_opal_dev(data, send_recv) NULL #endif /* CONFIG_BLK_SED_OPAL */ #endif /* LINUX_OPAL_H */ -- cgit v1.2.3 From 8a9ae523282f324989850fcf41312b42a2fb9296 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 17 Feb 2017 13:59:40 +0100 Subject: nvme: Check for Security send/recv support before issuing commands. We need to verify that the controller supports the security commands before actually trying to issue them. Signed-off-by: Scott Bauer [hch: moved the check so that we don't call into the OPAL code if not supported] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3d1c6f1b15c9..00eac863a9c7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -244,6 +244,7 @@ enum { NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, + NVME_CTRL_OACS_SEC_SUPP = 1 << 0, }; struct nvme_lbaf { -- cgit v1.2.3