From 729204ef49ec00b788ce23deb9eb922a5769f55d Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sat, 17 Dec 2016 18:49:09 +0800
Subject: block: relax check on sg gap

If the last bvec of the 1st bio and the 1st bvec of the next bio are
physically contiguous, and the latter can be merged into the last segment
of the 1st bio, they should not be considered to violate the sg gap (or
virt boundary) limit.

Both Vitaly and Dexuan reported that lots of unmergeable small bios are
observed when running mkfs on Hyper-V virtual storage, and that performance
becomes quite low. This patch fixes that performance issue.

The same issue should exist on NVMe, since it sets a virt boundary too.

Reported-by: Vitaly Kuznetsov
Reported-by: Dexuan Cui
Tested-by: Dexuan Cui
Cc: Keith Busch
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83695641bd5e..b20da8dfa7ec 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1607,6 +1607,25 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
 	return __bvec_gap_to_prev(q, bprv, offset);
 }
 
+/*
+ * Check if the two bvecs from two bios can be merged to one segment.
+ * If yes, no need to check gap between the two bios since the 1st bio
+ * and the 1st bvec in the 2nd bio can be handled in one segment.
+ */
+static inline bool bios_segs_mergeable(struct request_queue *q,
+		struct bio *prev, struct bio_vec *prev_last_bv,
+		struct bio_vec *next_first_bv)
+{
+	if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv))
+		return false;
+	if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv))
+		return false;
+	if (prev->bi_seg_back_size + next_first_bv->bv_len >
+			queue_max_segment_size(q))
+		return false;
+	return true;
+}
+
 static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
 			 struct bio *next)
 {
@@ -1616,7 +1635,8 @@ static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
 		bio_get_last_bvec(prev, &pb);
 		bio_get_first_bvec(next, &nb);
 
-		return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+		if (!bios_segs_mergeable(q, prev, &pb, &nb))
+			return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
 	}
 
 	return false;
-- cgit v1.2.3


From f8a5b12247fe18f7fed801ad262a7ab190e1f848 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 13 Dec 2016 09:24:51 -0700
Subject: blk-mq: make mq_ops a const pointer

We never change it, make that clear.
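What const-qualifying an ops table buys is easy to demonstrate outside the kernel: the table can be placed in read-only data, and any attempt to reassign one of its function pointers is rejected at compile time. The stand-alone user-space C sketch below is illustrative only; the names are invented and it is not kernel code.

/* Stand-alone illustration, not kernel code: a const ops table lands in
 * read-only data, and writes through the const pointer do not compile. */
#include <stdio.h>

struct demo_ops {
        void (*queue_rq)(int tag);
};

static void demo_queue_rq(int tag)
{
        printf("queueing request with tag %d\n", tag);
}

static const struct demo_ops demo_mq_ops = { .queue_rq = demo_queue_rq };

int main(void)
{
        const struct demo_ops *ops = &demo_mq_ops;

        ops->queue_rq(42);
        /* ops->queue_rq = NULL; -- rejected: assignment of member in
         * read-only object */
        return 0;
}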
Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche --- include/linux/blk-mq.h | 2 +- include/linux/blkdev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4a2ab5d99ff7..afc81d77e471 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -60,7 +60,7 @@ struct blk_mq_hw_ctx { struct blk_mq_tag_set { unsigned int *mq_map; - struct blk_mq_ops *ops; + const struct blk_mq_ops *ops; unsigned int nr_hw_queues; unsigned int queue_depth; /* max hw supported */ unsigned int reserved_tags; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b20da8dfa7ec..2e99d659b0f1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -407,7 +407,7 @@ struct request_queue { dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; - struct blk_mq_ops *mq_ops; + const struct blk_mq_ops *mq_ops; unsigned int *mq_map; -- cgit v1.2.3 From c51ca6cf545bc51ad38bd50816bde37c647d608d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 10 Dec 2016 15:13:59 -0700 Subject: block: move existing elevator ops to union Prep patch for adding MQ ops as well, since doing anon unions with named initializers doesn't work on older compilers. Signed-off-by: Jens Axboe Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- include/linux/elevator.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b276e9ef0e0b..2a9e966eed03 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -94,7 +94,9 @@ struct elevator_type struct kmem_cache *icq_cache; /* fields provided by elevator implementation */ - struct elevator_ops ops; + union { + struct elevator_ops sq; + } ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; -- cgit v1.2.3 From 16a3c2a70cad5ccdc2dc0a4544bff82554807493 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Dec 2016 14:27:46 -0700 Subject: blk-mq: un-export blk_mq_free_hctx_request() It's only used in blk-mq, kill it from the main exported header and kill the symbol export as well. Signed-off-by: Jens Axboe Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Omar Sandoval --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index afc81d77e471..2686f9e7302a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -181,7 +181,6 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); -void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { -- cgit v1.2.3 From fd2d332677c687ca90c12a47d6c377c547100b56 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Jan 2017 10:04:45 -0700 Subject: blk-mq: add support for carrying internal tag information in blk_qc_t No functional change in this patch, just in preparation for having two types of tags available to the block layer for a single request. 
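The cookie layout being introduced can be exercised in plain user-space C. The sketch below is illustrative only, but it mirrors the constants and helpers added in the blk_types.h hunk that follows: the low 16 bits carry the tag, the bits above BLK_QC_T_SHIFT carry the hardware queue number, and bit 31 marks an internal (scheduler) tag.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t blk_qc_t;

#define BLK_QC_T_SHIFT     16
#define BLK_QC_T_INTERNAL  (1U << 31)

static blk_qc_t tag_to_qc_t(unsigned int tag, unsigned int queue_num,
                            bool internal)
{
        blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT);

        if (internal)
                ret |= BLK_QC_T_INTERNAL;
        return ret;
}

static unsigned int qc_t_to_queue_num(blk_qc_t cookie)
{
        /* mask off the internal flag before recovering the queue number */
        return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
}

static unsigned int qc_t_to_tag(blk_qc_t cookie)
{
        return cookie & ((1U << BLK_QC_T_SHIFT) - 1);
}

int main(void)
{
        blk_qc_t c = tag_to_qc_t(42, 3, true);

        assert(qc_t_to_tag(c) == 42);
        assert(qc_t_to_queue_num(c) == 3);
        assert(c & BLK_QC_T_INTERNAL);
        return 0;
}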
Signed-off-by: Jens Axboe
Reviewed-by: Omar Sandoval
---
 include/linux/blk_types.h | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 519ea2c9df61..0e5b1cd5113c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -232,22 +232,29 @@ static inline bool op_is_sync(unsigned int op)
 }
 
 typedef unsigned int blk_qc_t;
-#define BLK_QC_T_NONE	-1U
-#define BLK_QC_T_SHIFT	16
+#define BLK_QC_T_NONE		-1U
+#define BLK_QC_T_SHIFT		16
+#define BLK_QC_T_INTERNAL	(1U << 31)
 
 static inline bool blk_qc_t_valid(blk_qc_t cookie)
 {
 	return cookie != BLK_QC_T_NONE;
 }
 
-static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num)
+static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num,
+				       bool internal)
 {
-	return tag | (queue_num << BLK_QC_T_SHIFT);
+	blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT);
+
+	if (internal)
+		ret |= BLK_QC_T_INTERNAL;
+
+	return ret;
 }
 
 static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
 {
-	return cookie >> BLK_QC_T_SHIFT;
+	return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
 }
 
 static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
@@ -255,6 +262,11 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
 	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
 }
 
+static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
+{
+	return (cookie & BLK_QC_T_INTERNAL) != 0;
+}
+
 struct blk_issue_stat {
 	u64 time;
 };
-- cgit v1.2.3


From bd166ef183c263c5ced656d49ef19c7da4adc774 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 17 Jan 2017 06:03:22 -0700
Subject: blk-mq-sched: add framework for MQ capable IO schedulers

This adds a set of hooks that intercept the blk-mq path of
allocating/inserting/issuing/completing requests, allowing us to develop a
scheduler within that framework.

We reuse the existing elevator scheduler API on the registration side, but
augment that with the scheduler flagging support for the blk-mq interface,
and with a separate set of ops hooks for MQ devices.

We split driver and scheduler tags, so we can run the scheduling
independently of device queue depth.
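The driver/scheduler tag split can be pictured with a small stand-alone sketch (conceptual only, not the blk-mq implementation): every request is tracked with a tag from a deep scheduler pool, and only consumes a tag from the shallower driver pool when it is actually dispatched, so the scheduler can hold more requests than the device queue depth.

#include <stdbool.h>
#include <stdio.h>

#define SCHED_TAGS  8   /* scheduler depth */
#define DRIVER_TAGS 2   /* device queue depth */

static bool sched_tag[SCHED_TAGS];
static bool driver_tag[DRIVER_TAGS];

/* grab the first free tag in a pool, or -1 if the pool is exhausted */
static int get_tag(bool *pool, int n)
{
        for (int i = 0; i < n; i++) {
                if (!pool[i]) {
                        pool[i] = true;
                        return i;
                }
        }
        return -1;
}

int main(void)
{
        /* allocation: every request gets a scheduler tag */
        for (int i = 0; i < 4; i++)
                printf("request %d: sched tag %d\n", i,
                       get_tag(sched_tag, SCHED_TAGS));

        /* dispatch: only as many requests as the device can take get a
         * driver tag (-1 here); the rest stay with the scheduler until a
         * driver tag frees up */
        for (int i = 0; i < 4; i++)
                printf("request %d: driver tag %d\n", i,
                       get_tag(driver_tag, DRIVER_TAGS));

        return 0;
}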
Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- include/linux/blk-mq.h | 5 ++++- include/linux/blkdev.h | 4 +++- include/linux/elevator.h | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2686f9e7302a..63569eb46d15 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -22,6 +22,7 @@ struct blk_mq_hw_ctx { unsigned long flags; /* BLK_MQ_F_* flags */ + void *sched_data; struct request_queue *queue; struct blk_flush_queue *fq; @@ -35,6 +36,7 @@ struct blk_mq_hw_ctx { atomic_t wait_index; struct blk_mq_tags *tags; + struct blk_mq_tags *sched_tags; struct srcu_struct queue_rq_srcu; @@ -156,6 +158,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, + BLK_MQ_S_SCHED_RESTART = 2, BLK_MQ_MAX_DEPTH = 10240, @@ -179,13 +182,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ + BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ }; struct request *blk_mq_alloc_request(struct request_queue *q, int rw, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2e99d659b0f1..25564857f5f8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -154,6 +154,7 @@ struct request { /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ + int tag; sector_t __sector; /* sector cursor */ struct bio *bio; @@ -220,9 +221,10 @@ struct request { unsigned short ioprio; + int internal_tag; + void *special; /* opaque pointer available for LLD use */ - int tag; int errors; /* diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2a9e966eed03..ecb96fd67c6d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -77,6 +77,34 @@ struct elevator_ops elevator_registered_fn *elevator_registered_fn; }; +struct blk_mq_alloc_data; +struct blk_mq_hw_ctx; + +struct elevator_mq_ops { + int (*init_sched)(struct request_queue *, struct elevator_type *); + void (*exit_sched)(struct elevator_queue *); + + bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); + bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); + int (*request_merge)(struct request_queue *q, struct request **, struct bio *); + void (*request_merged)(struct request_queue *, struct request *, int); + void (*requests_merged)(struct request_queue *, struct request *, struct request *); + struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); + void (*put_request)(struct request *); + void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); + void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *); + bool (*has_work)(struct blk_mq_hw_ctx *); + void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); + void (*started_request)(struct request *); + void (*requeue_request)(struct request *); + struct request *(*former_request)(struct request_queue *, struct request *); + struct request *(*next_request)(struct request_queue *, struct request *); + int (*get_rq_priv)(struct request_queue 
*, struct request *); + void (*put_rq_priv)(struct request_queue *, struct request *); + void (*init_icq)(struct io_cq *); + void (*exit_icq)(struct io_cq *); +}; + #define ELV_NAME_MAX (16) struct elv_fs_entry { @@ -96,12 +124,14 @@ struct elevator_type /* fields provided by elevator implementation */ union { struct elevator_ops sq; + struct elevator_mq_ops mq; } ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; struct module *elevator_owner; + bool uses_mq; /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */ @@ -125,6 +155,7 @@ struct elevator_queue struct kobject kobj; struct mutex sysfs_lock; unsigned int registered:1; + unsigned int uses_mq:1; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; @@ -141,6 +172,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *, extern void elv_merged_request(struct request_queue *, struct request *, int); extern void elv_bio_merged(struct request_queue *q, struct request *, struct bio *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -- cgit v1.2.3 From d34849913819a5e0cbfbe724dbe79df89278c524 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jan 2017 14:43:58 -0700 Subject: blk-mq-sched: allow setting of default IO scheduler Add Kconfig entries to manage what devices get assigned an MQ scheduler, and add a blk-mq flag for drivers to opt out of scheduling. The latter is useful for admin type queues that still allocate a blk-mq queue and tag set, but aren't use for normal IO. Signed-off-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 63569eb46d15..8e4df3d6c8cd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -153,6 +153,7 @@ enum { BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_BLOCKING = 1 << 5, + BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, -- cgit v1.2.3 From 07e4fead45e6e1932f0b960655ab554b6aab6a08 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 08:06:40 -0800 Subject: blk-mq: create debugfs directory tree In preparation for putting blk-mq debugging information in debugfs, create a directory tree mirroring the one in sysfs: # tree -d /sys/kernel/debug/block /sys/kernel/debug/block |-- nvme0n1 | `-- mq | |-- 0 | | `-- cpu0 | |-- 1 | | `-- cpu1 | |-- 2 | | `-- cpu2 | `-- 3 | `-- cpu3 `-- vda `-- mq `-- 0 |-- cpu0 |-- cpu1 |-- cpu2 `-- cpu3 Also add the scaffolding for the actual files that will go in here, either under the hardware queue or software queue directories. 
Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25564857f5f8..0ee283f3cffe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,11 @@ struct request_queue { struct list_head tag_set_list; struct bio_set *bio_split; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dir; + struct dentry *mq_debugfs_dir; +#endif + bool mq_sysfs_init_done; }; -- cgit v1.2.3 From 24af1ccfe12adddbe17d11801e1689791a4cc282 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 25 Jan 2017 14:32:13 -0800 Subject: sbitmap: add helpers for dumping to a seq_file This is useful debugging information that will be used in the blk-mq debugfs directory. Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Changed 'weight' to 'busy'. Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index f017fd6e69c4..d4e0a204c118 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -258,6 +258,26 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) unsigned int sbitmap_weight(const struct sbitmap *sb); +/** + * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. + * @sb: Bitmap to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The format may change at any time. + */ +void sbitmap_show(struct sbitmap *sb, struct seq_file *m); + +/** + * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct + * seq_file. + * @sb: Bitmap to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The output isn't guaranteed to be internally + * consistent. + */ +void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); + /** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. @@ -370,4 +390,14 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); +/** + * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct + * seq_file. + * @sbq: Bitmap queue to show. + * @m: struct seq_file to write to. + * + * This is intended for debugging. The format may change at any time. + */ +void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); + #endif /* __LINUX_SCALE_BITMAP_H */ -- cgit v1.2.3 From 50e1dab86aa2c10cbca2f754aae9542169403141 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 14:42:34 -0700 Subject: blk-mq-sched: fix starvation for multiple hardware queues and shared tags If we have both multiple hardware queues and shared tag map between devices, we need to ensure that we propagate the hardware queue restart bit higher up. This is because we can get into a situation where we don't have any IO pending on a hardware queue, yet we fail getting a tag to start new IO. If that happens, it's not enough to mark the hardware queue as needing a restart, we need to bubble that up to the higher level queue as well. 
Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval Tested-by: Hannes Reinecke --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0ee283f3cffe..883b8abe4305 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -607,6 +607,7 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ +#define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From c13660a08c8b3bb49def4374bfd414aaaa564662 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jan 2017 12:40:07 -0700 Subject: blk-mq-sched: change ->dispatch_requests() to ->dispatch_request() When we invoke dispatch_requests(), the scheduler empties everything into the passed in list. This isn't always a good thing, since it means that we remove items that we could have potentially merged with. Change the function to dispatch single requests at the time. If we do that, we can backoff exactly at the point where the device can't consume more IO, and leave the rest with the scheduler for better merging and future dispatch decision making. Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval Tested-by: Hannes Reinecke --- include/linux/elevator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ecb96fd67c6d..b5825c4f06f7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -92,7 +92,7 @@ struct elevator_mq_ops { struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); void (*put_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); - void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *); + struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); void (*started_request)(struct request *); -- cgit v1.2.3 From f73f44eb00cb136990cfb7d40e436c13d7669ec8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Jan 2017 08:30:47 -0700 Subject: block: add a op_is_flush helper This centralizes the checks for bios that needs to be go into the flush state machine. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0e5b1cd5113c..37c9a43c5e78 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -220,6 +220,15 @@ static inline bool op_is_write(unsigned int op) return (op & 1); } +/* + * Check if the bio or request is one that needs special treatment in the + * flush state machine. + */ +static inline bool op_is_flush(unsigned int op) +{ + return op & (REQ_FUA | REQ_PREFLUSH); +} + /* * Reads are always treated as synchronous, as are requests with the FUA or * PREFLUSH flag. 
Other operations may be marked as synchronous using the -- cgit v1.2.3 From ade69e2432b795c76653e1dfa09c684549826a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:09 +0100 Subject: lightnvm: merge gennvm with core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the first iteration of Open-Channel SSDs, it was anticipated that there could be various media managers on top of an open-channel SSD, such to allow vendors to plug in their own host-side FTLs, without the media manager in between. Now that an Open-Channel SSD is exposed as a traditional block device, there is no longer a need for this. Therefore lets merge the gennvm code with core and simplify the stack. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 97 ++++-------------------------------------------- 1 file changed, 7 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7c273bbc5351..84309fe27472 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -80,8 +80,6 @@ struct nvm_dev_ops { unsigned int max_phys_sect; }; - - #ifdef CONFIG_NVM #include @@ -272,15 +270,6 @@ enum { NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; -/* system block cpu representation */ -struct nvm_sb_info { - unsigned long seqnr; - unsigned long erase_cnt; - unsigned int version; - char mmtype[NVM_MMTYPE_LEN]; - struct ppa_addr fs_ppa; -}; - /* Device generic information */ struct nvm_geo { int nr_chnls; @@ -308,6 +297,7 @@ struct nvm_geo { int sec_per_lun; }; +/* sub-device structure */ struct nvm_tgt_dev { /* Device information */ struct nvm_geo geo; @@ -329,17 +319,10 @@ struct nvm_dev { struct list_head devices; - /* Media manager */ - struct nvmm_type *mt; - void *mp; - - /* System blocks */ - struct nvm_sb_info sb; - /* Device information */ struct nvm_geo geo; - /* lower page table */ + /* lower page table */ int lps_per_blk; int *lptbl; @@ -359,6 +342,10 @@ struct nvm_dev { struct mutex mlock; spinlock_t lock; + + /* target management */ + struct list_head area_list; + struct list_head targets; }; static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, @@ -452,11 +439,6 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) (ppa1.g.blk == ppa2.g.blk)); } -static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) -{ - return dev->lptbl[slc_pg]; -} - typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); @@ -487,49 +469,6 @@ extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *); extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); -typedef int (nvmm_register_fn)(struct nvm_dev *); -typedef void (nvmm_unregister_fn)(struct nvm_dev *); - -typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); -typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); -typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *); -typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int); -typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); -typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); -typedef struct ppa_addr (nvmm_trans_ppa_fn)(struct nvm_tgt_dev *, - struct 
ppa_addr, int); -typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int); - -enum { - TRANS_TGT_TO_DEV = 0x0, - TRANS_DEV_TO_TGT = 0x1, -}; - -struct nvmm_type { - const char *name; - unsigned int version[3]; - - nvmm_register_fn *register_mgr; - nvmm_unregister_fn *unregister_mgr; - - nvmm_create_tgt_fn *create_tgt; - nvmm_remove_tgt_fn *remove_tgt; - - nvmm_submit_io_fn *submit_io; - nvmm_erase_blk_fn *erase_blk; - - nvmm_get_area_fn *get_area; - nvmm_put_area_fn *put_area; - - nvmm_trans_ppa_fn *trans_ppa; - nvmm_part_to_tgt_fn *part_to_tgt; - - struct list_head list; -}; - -extern int nvm_register_mgr(struct nvmm_type *); -extern void nvm_unregister_mgr(struct nvmm_type *); - extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); @@ -559,31 +498,9 @@ extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -/* sysblk.c */ -#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ - -/* system block on disk representation */ -struct nvm_system_block { - __be32 magic; /* magic signature */ - __be32 seqnr; /* sequence number */ - __be32 erase_cnt; /* erase count */ - __be16 version; /* version number */ - u8 mmtype[NVM_MMTYPE_LEN]; /* media manager name */ - __be64 fs_ppa; /* PPA for media manager - * superblock */ -}; - -extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); -extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); -extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); - extern int nvm_dev_factory(struct nvm_dev *, int flags); -#define nvm_for_each_lun_ppa(geo, ppa, chid, lunid) \ - for ((chid) = 0, (ppa).ppa = 0; (chid) < (geo)->nr_chnls; \ - (chid)++, (ppa).g.ch = (chid)) \ - for ((lunid) = 0; (lunid) < (geo)->luns_per_chnl; \ - (lunid)++, (ppa).g.lun = (lunid)) +extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From 10995c3dc9d7f47b92ff3e74b4bd191ddb7991ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:10 +0100 Subject: lightnvm: collapse nvm_erase_ppa and nvm_erase_blk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After gennvm and core have been merged, there are no more callers to nvm_erase_ppa. Therefore collapse the device specific and target specific erase functions. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 84309fe27472..f2007b2c4979 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -483,7 +483,6 @@ extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int); extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From 583b7058b2e8071f59646c8fb027f6c3597417ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:11 +0100 Subject: lightnvm: remove nvm_submit_ppa* functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvm_submit_ppa* functions are no longer needed after gennvm and core have been merged. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index f2007b2c4979..abb3d55c107f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -489,10 +489,6 @@ extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *, int); -extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, - void *, int); -extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int, - int, void *, int); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -- cgit v1.2.3 From 8f4fe008fb256649bd0e16c96a6eafa3bd916ac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:12 +0100 Subject: lightnvm: remove nvm_get_bb_tbl and nvm_set_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the merge of gennvm and core, there is no longer a need for the device specific bad block functions. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index abb3d55c107f..cad1e1cb0635 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -473,7 +473,6 @@ extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); -extern int nvm_set_bb_tbl(struct nvm_dev *, struct ppa_addr *, int, int); extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); @@ -490,7 +489,6 @@ extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); -extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); extern int nvm_dev_factory(struct nvm_dev *, int flags); -- cgit v1.2.3 From dab8ee9e8a30620a5b5f22d6c0b3749217093803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:14 +0100 Subject: lightnvm: cleanup nvm transformation functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Going from target specific ppa addresses to device was accomplished by first converting target to generic ppa addresses and generic to device addresses. The conversion was either open-coded or used the built-in nvm_trans_* and nvm_map_* functions for conversion. Simplify the interface and cleanup the calls to provide clean functions that now either take a list of ppas or a nvm_rq, and is exposed through: void nvm_ppa_* - target to/from device with a list of PPAs, void nvm_rq_* - target to/from device with a nvm_rq. 
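The target/device address conversion referred to above boils down to shifting each address component to a per-device bit offset, which is what generic_to_dev_addr() and dev_to_generic_addr() do in the hunk that follows. The stand-alone sketch below illustrates the packing with made-up offsets and widths that do not describe any real device.

#include <assert.h>
#include <stdint.h>

/* Example address format; offsets/widths are invented for illustration. */
struct ppa_fmt {
        unsigned int blk_offset, pg_offset, lun_offset, ch_offset;
};

/* same shift-and-or pattern as generic_to_dev_addr() in the hunk below */
static uint64_t generic_to_dev(const struct ppa_fmt *f, uint64_t blk,
                               uint64_t pg, uint64_t lun, uint64_t ch)
{
        return (blk << f->blk_offset) | (pg << f->pg_offset) |
               (lun << f->lun_offset) | (ch << f->ch_offset);
}

/* extract one component again, the inverse direction */
static uint64_t field(uint64_t ppa, unsigned int off, unsigned int width)
{
        return (ppa >> off) & ((1ULL << width) - 1);
}

int main(void)
{
        struct ppa_fmt fmt = { .blk_offset = 0, .pg_offset = 12,
                               .lun_offset = 20, .ch_offset = 26 };
        uint64_t ppa = generic_to_dev(&fmt, 37, 5, 2, 1);

        assert(field(ppa, fmt.blk_offset, 12) == 37);
        assert(field(ppa, fmt.pg_offset, 8) == 5);
        assert(field(ppa, fmt.lun_offset, 6) == 2);
        assert(field(ppa, fmt.ch_offset, 6) == 1);
        return 0;
}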
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cad1e1cb0635..faa0fbfe339a 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -378,10 +378,10 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, return l; } -static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, - struct ppa_addr r) +static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr r) { - struct nvm_geo *geo = &dev->geo; + struct nvm_geo *geo = &tgt_dev->geo; struct ppa_addr l; l.ppa = ((u64)r.g.blk) << geo->ppaf.blk_offset; @@ -394,10 +394,10 @@ static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, return l; } -static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) +static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr r) { - struct nvm_geo *geo = &dev->geo; + struct nvm_geo *geo = &tgt_dev->geo; struct ppa_addr l; l.ppa = 0; @@ -477,8 +477,6 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); -extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); -extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -- cgit v1.2.3 From 19bd6fe73ca812964963aa30827cff9aae64a715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:15 +0100 Subject: lightnvm: reduce number of nvm_id groups to one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The number of configuration groups has been limited to one in current code, even if there is support for up to four. With the introduction of the open-channel SSD 1.3 specification, only a single group is exposed onwards. Reflect this in the nvm_id structure. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index faa0fbfe339a..ce0b2dac84ac 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -200,11 +200,10 @@ struct nvm_addr_format { struct nvm_id { u8 ver_id; u8 vmnt; - u8 cgrps; u32 cap; u32 dom; struct nvm_addr_format ppaf; - struct nvm_id_group groups[4]; + struct nvm_id_group grp; } __packed; struct nvm_target { -- cgit v1.2.3 From 06894efea706b3cd4ce31e341ec51b4c62c34a86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 31 Jan 2017 13:17:17 +0100 Subject: lightnvm: use end_io callback instead of instance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the lightnvm core had the "gennvm" layer between the device and the target, there was a need for the core to be able to figure out which target it should send an end_io callback to. Leading to a "double" end_io, first for the media manager instance, and then for the target instance. 
Now that core and gennvm are merged, there is no longer a need for this,
and a single end_io callback will do.

Signed-off-by: Matias Bjørling
Signed-off-by: Jens Axboe
---
 include/linux/lightnvm.h | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ce0b2dac84ac..17cd454f0d87 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -213,10 +213,6 @@ struct nvm_target {
 	struct gendisk *disk;
 };
 
-struct nvm_tgt_instance {
-	struct nvm_tgt_type *tt;
-};
-
 #define ADDR_EMPTY (~0ULL)
 
 #define NVM_VERSION_MAJOR 1
@@ -227,7 +223,6 @@ struct nvm_rq;
 typedef void (nvm_end_io_fn)(struct nvm_rq *);
 
 struct nvm_rq {
-	struct nvm_tgt_instance *ins;
 	struct nvm_tgt_dev *dev;
 
 	struct bio *bio;
@@ -251,6 +246,8 @@ struct nvm_rq {
 
 	u64 ppa_status; /* ppa media status */
 	int error;
+
+	void *private;
 };
 
 static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu)
@@ -450,7 +447,6 @@ struct nvm_tgt_type {
 	/* target entry points */
 	nvm_tgt_make_rq_fn *make_rq;
 	nvm_tgt_capacity_fn *capacity;
-	nvm_end_io_fn *end_io;
 
 	/* module-specific init/teardown */
 	nvm_tgt_init_fn *init;
@@ -484,7 +480,7 @@ extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
 			   void *);
 extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
 extern void nvm_put_area(struct nvm_tgt_dev *, sector_t);
-extern void nvm_end_io(struct nvm_rq *, int);
+extern void nvm_end_io(struct nvm_rq *);
 extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
 extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
 
-- cgit v1.2.3


From 38ea2f7656f815e7330868cbec7bada0fd7933a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?=
Date: Tue, 31 Jan 2017 13:17:18 +0100
Subject: lightnvm: Add CRC read error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let the host differentiate between a read error and a CRC check error on
the device side.

Signed-off-by: Javier González
Signed-off-by: Matias Bjørling
Signed-off-by: Jens Axboe
---
 include/linux/lightnvm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 17cd454f0d87..bc282d26017a 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -107,6 +107,7 @@ enum {
 	NVM_RSP_ERR_FAILWRITE	= 0x40ff,
 	NVM_RSP_ERR_EMPTYPAGE	= 0x42ff,
 	NVM_RSP_ERR_FAILECC	= 0x4281,
+	NVM_RSP_ERR_FAILCRC	= 0x4004,
 	NVM_RSP_WARN_HIGHECC	= 0x4700,
 
 	/* Device opcodes */
-- cgit v1.2.3


From 9a69b0ed6257ae5e71c99bf21ce53f98c558476a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?=
Date: Tue, 31 Jan 2017 13:17:20 +0100
Subject: lightnvm: allow targets to use sysfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to register through the sysfs interface, a driver needs to know
its kobject.

On a disk structure, this happens when the partition information is added
(device_add_disk), which for lightnvm takes place after the target has been
initialized. This means that on target initialization, the kobject has not
been created yet.

This patch adds a target function to let targets initialize their own
kobject as a child of the disk kobject.

Signed-off-by: Javier González

Added exit typedef and passed gendisk instead of void pointer for exit.
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index bc282d26017a..ca45e4a088a9 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -440,6 +440,8 @@ typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); typedef void (nvm_tgt_exit_fn)(void *); +typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); +typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); struct nvm_tgt_type { const char *name; @@ -453,6 +455,10 @@ struct nvm_tgt_type { nvm_tgt_init_fn *init; nvm_tgt_exit_fn *exit; + /* sysfs */ + nvm_tgt_sysfs_init_fn *sysfs_init; + nvm_tgt_sysfs_exit_fn *sysfs_exit; + /* For internal use */ struct list_head list; }; -- cgit v1.2.3 From 455a7b238cd6bc68c4a550cbbd37c1e22b64f71c Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 3 Feb 2017 12:50:31 -0700 Subject: block: Add Sed-opal library This patch implements the necessary logic to bring an Opal enabled drive out of a factory-enabled into a working Opal state. This patch set also enables logic to save a password to be replayed during a resume from suspend. Signed-off-by: Scott Bauer Signed-off-by: Rafael Antognolli Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 178 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 include/linux/sed-opal.h (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h new file mode 100644 index 000000000000..af1a85eae193 --- /dev/null +++ b/include/linux/sed-opal.h @@ -0,0 +1,178 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli + * Scott Bauer + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef LINUX_OPAL_H +#define LINUX_OPAL_H + +#include +#include + +/* + * These constant values come from: + * SPC-4 section + * 6.30 SECURITY PROTOCOL IN command / table 265. + */ +enum { + TCG_SECP_00 = 0, + TCG_SECP_01, +}; +struct opal_dev; + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 + +typedef int (*opal_step)(struct opal_dev *dev); +typedef int (sec_send_recv)(struct opal_dev *ctx, u16 spsp, u8 secp, + void *buffer, size_t len, bool send); + + +enum opal_atom_width { + OPAL_WIDTH_TINY, + OPAL_WIDTH_SHORT, + OPAL_WIDTH_MEDIUM, + OPAL_WIDTH_LONG, + OPAL_WIDTH_TOKEN +}; + +/* + * Token defs derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * 3.2.2 Data Stream Encoding + */ +enum opal_response_token { + OPAL_DTA_TOKENID_BYTESTRING = 0xe0, + OPAL_DTA_TOKENID_SINT = 0xe1, + OPAL_DTA_TOKENID_UINT = 0xe2, + OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0 +}; + +/* + * On the parsed response, we don't store again the toks that are already + * stored in the response buffer. 
Instead, for each token, we just store a + * pointer to the position in the buffer where the token starts, and the size + * of the token in bytes. + */ +struct opal_resp_tok { + const u8 *pos; + size_t len; + enum opal_response_token type; + enum opal_atom_width width; + union { + u64 u; + s64 s; + } stored; +}; + +/* + * From the response header it's not possible to know how many tokens there are + * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later + * if we start dealing with messages that have more than that, we can increase + * this number. This is done to avoid having to make two passes through the + * response, the first one counting how many tokens we have and the second one + * actually storing the positions. + */ +struct parsed_resp { + int num; + struct opal_resp_tok toks[MAX_TOKS]; +}; + +/** + * struct opal_dev - The structure representing a OPAL enabled SED. + * @supported: Whether or not OPAL is supported on this controller. + * @send_recv: The combined sec_send/sec_recv function pointer. + * @opal_step: A series of opal methods that are necessary to complete a command. + * @func_data: An array of parameters for the opal methods above. + * @state: Describes the current opal_step we're working on. + * @dev_lock: Locks the entire opal_dev structure. + * @parsed: Parsed response from controller. + * @prev_data: Data returned from a method to the controller. + * @unlk_lst: A list of Locking ranges to unlock on this device during a resume. + */ +struct opal_dev { + bool initialized; + bool supported; + sec_send_recv *send_recv; + + const opal_step *funcs; + void **func_data; + int state; + struct mutex dev_lock; + u16 comid; + u32 hsn; + u32 tsn; + u64 align; + u64 lowest_lba; + + size_t pos; + u8 cmd[IO_BUFFER_LENGTH]; + u8 resp[IO_BUFFER_LENGTH]; + + struct parsed_resp parsed; + size_t prev_d_len; + void *prev_data; + + struct list_head unlk_lst; +}; + +#ifdef CONFIG_BLK_SED_OPAL +bool opal_unlock_from_suspend(struct opal_dev *dev); +void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv); +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr); + +static inline bool is_sed_ioctl(unsigned int cmd) +{ + switch (cmd) { + case IOC_OPAL_SAVE: + case IOC_OPAL_LOCK_UNLOCK: + case IOC_OPAL_TAKE_OWNERSHIP: + case IOC_OPAL_ACTIVATE_LSP: + case IOC_OPAL_SET_PW: + case IOC_OPAL_ACTIVATE_USR: + case IOC_OPAL_REVERT_TPR: + case IOC_OPAL_LR_SETUP: + case IOC_OPAL_ADD_USR_TO_LR: + case IOC_OPAL_ENABLE_DISABLE_MBR: + case IOC_OPAL_ERASE_LR: + case IOC_OPAL_SECURE_ERASE_LR: + return true; + } + return false; +} +#else +static inline bool is_sed_ioctl(unsigned int cmd) +{ + return false; +} + +static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd, + unsigned long ptr) +{ + return 0; +} +static inline bool opal_unlock_from_suspend(struct opal_dev *dev) +{ + return false; +} +static inline void init_opal_dev(struct opal_dev *opal_dev, + sec_send_recv *send_recv) +{ + opal_dev->supported = false; + opal_dev->initialized = true; +} +#endif /* CONFIG_BLK_SED_OPAL */ +#endif /* LINUX_OPAL_H */ -- cgit v1.2.3 From 853fe1bf7554155376bb3b231112cdff9ff79177 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 13 Feb 2017 16:25:26 -0800 Subject: cdrom: Make device operations read-only Since function tables are a common target for attackers, it's best to keep them in read-only memory. As such, this makes the CDROM device ops tables const. 
This additionally drops n_minors, since it isn't used meaningfully, and sets
the only user of cdrom_dummy_generic_packet explicitly so that the variables
can all be const.

Inspired by similar changes in grsecurity/PaX.

Signed-off-by: Kees Cook
Acked-by: David S. Miller
Signed-off-by: Jens Axboe
---
 include/linux/cdrom.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index 8609d577bb66..6e8f209a6dff 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -36,7 +36,7 @@ struct packet_command
 
 /* Uniform cdrom data structures for cdrom.c */
 struct cdrom_device_info {
-	struct cdrom_device_ops *ops;	/* link to device_ops */
+	const struct cdrom_device_ops *ops;	/* link to device_ops */
 	struct list_head list;		/* linked list of all device_info */
 	struct gendisk *disk;		/* matching block layer disk */
 	void *handle;			/* driver-dependent data */
@@ -87,7 +87,6 @@ struct cdrom_device_ops {
 
 /* driver specifications */
 	const int capability;		/* capability flags */
-	int n_minors;			/* number of active minor devices */
 	/* handle uniform packets for scsi type devices (scsi,atapi) */
 	int (*generic_packet) (struct cdrom_device_info *,
 			       struct packet_command *);
@@ -123,6 +122,8 @@ extern int cdrom_mode_sense(struct cdrom_device_info *cdi,
 			    int page_code, int page_control);
 extern void init_cdrom_command(struct packet_command *cgc,
 			       void *buffer, int len, int type);
+extern int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
+				      struct packet_command *cgc);
 
 /* The SCSI spec says there could be 256 slots. */
 #define CDROM_MAX_SLOTS 256
-- cgit v1.2.3


From e225c20eb0fd0b6657e640408f11ee392dc82b5b Mon Sep 17 00:00:00 2001
From: Scott Bauer
Date: Tue, 14 Feb 2017 17:29:36 -0700
Subject: Move stack parameters for sed_ioctl to prevent oversized stack with CONFIG_KASAN

When CONFIG_KASAN is enabled, compilation fails:

block/sed-opal.c: In function 'sed_ioctl':
block/sed-opal.c:2447:1: error: the frame size of 2256 bytes is larger than 2048 bytes [-Werror=frame-larger-than=]

Move all the ioctl structures off the stack and allocate them dynamically
using _IOC_SIZE().

Fixes: 455a7b238cd6 ("block: Add Sed-opal library")
Reported-by: Arnd Bergmann
Signed-off-by: Scott Bauer
Signed-off-by: Jens Axboe
---
 include/linux/sed-opal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index af1a85eae193..205d520ea688 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -132,7 +132,7 @@ struct opal_dev {
 #ifdef CONFIG_BLK_SED_OPAL
 bool opal_unlock_from_suspend(struct opal_dev *dev);
 void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv);
-int sed_ioctl(struct opal_dev *dev, unsigned int cmd, unsigned long ptr);
+int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr);
 
 static inline bool is_sed_ioctl(unsigned int cmd)
 {
@@ -160,7 +160,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 }
 
 static inline int sed_ioctl(struct opal_dev *dev, unsigned int cmd,
-			    unsigned long ptr)
+			    void __user *ioctl_ptr)
 {
 	return 0;
 }
-- cgit v1.2.3


From 4f1244c8298606b8fae64b4d78b820ae6b896e3c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 17 Feb 2017 13:59:39 +0100
Subject: block/sed-opal: allocate struct opal_dev dynamically

Instead of bloating the containing structure with it all the time, this
allocates struct opal_dev dynamically.
Additionally this allows moving the definition of struct opal_dev into sed-opal.c. For this a new private data field is added to it that is passed to the send/receive callback. After that a lot of internals can be made private as well. Signed-off-by: Christoph Hellwig Tested-by: Scott Bauer Reviewed-by: Scott Bauer Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 116 ++--------------------------------------------- 1 file changed, 4 insertions(+), 112 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 205d520ea688..deee23d012e7 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -21,117 +21,14 @@ #include #include -/* - * These constant values come from: - * SPC-4 section - * 6.30 SECURITY PROTOCOL IN command / table 265. - */ -enum { - TCG_SECP_00 = 0, - TCG_SECP_01, -}; struct opal_dev; -#define IO_BUFFER_LENGTH 2048 -#define MAX_TOKS 64 - -typedef int (*opal_step)(struct opal_dev *dev); -typedef int (sec_send_recv)(struct opal_dev *ctx, u16 spsp, u8 secp, - void *buffer, size_t len, bool send); - - -enum opal_atom_width { - OPAL_WIDTH_TINY, - OPAL_WIDTH_SHORT, - OPAL_WIDTH_MEDIUM, - OPAL_WIDTH_LONG, - OPAL_WIDTH_TOKEN -}; - -/* - * Token defs derived from: - * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 - * 3.2.2 Data Stream Encoding - */ -enum opal_response_token { - OPAL_DTA_TOKENID_BYTESTRING = 0xe0, - OPAL_DTA_TOKENID_SINT = 0xe1, - OPAL_DTA_TOKENID_UINT = 0xe2, - OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ - OPAL_DTA_TOKENID_INVALID = 0X0 -}; - -/* - * On the parsed response, we don't store again the toks that are already - * stored in the response buffer. Instead, for each token, we just store a - * pointer to the position in the buffer where the token starts, and the size - * of the token in bytes. - */ -struct opal_resp_tok { - const u8 *pos; - size_t len; - enum opal_response_token type; - enum opal_atom_width width; - union { - u64 u; - s64 s; - } stored; -}; - -/* - * From the response header it's not possible to know how many tokens there are - * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later - * if we start dealing with messages that have more than that, we can increase - * this number. This is done to avoid having to make two passes through the - * response, the first one counting how many tokens we have and the second one - * actually storing the positions. - */ -struct parsed_resp { - int num; - struct opal_resp_tok toks[MAX_TOKS]; -}; - -/** - * struct opal_dev - The structure representing a OPAL enabled SED. - * @supported: Whether or not OPAL is supported on this controller. - * @send_recv: The combined sec_send/sec_recv function pointer. - * @opal_step: A series of opal methods that are necessary to complete a command. - * @func_data: An array of parameters for the opal methods above. - * @state: Describes the current opal_step we're working on. - * @dev_lock: Locks the entire opal_dev structure. - * @parsed: Parsed response from controller. - * @prev_data: Data returned from a method to the controller. - * @unlk_lst: A list of Locking ranges to unlock on this device during a resume. 
- */ -struct opal_dev { - bool initialized; - bool supported; - sec_send_recv *send_recv; - - const opal_step *funcs; - void **func_data; - int state; - struct mutex dev_lock; - u16 comid; - u32 hsn; - u32 tsn; - u64 align; - u64 lowest_lba; - - size_t pos; - u8 cmd[IO_BUFFER_LENGTH]; - u8 resp[IO_BUFFER_LENGTH]; - - struct parsed_resp parsed; - size_t prev_d_len; - void *prev_data; - - struct list_head unlk_lst; -}; +typedef int (sec_send_recv)(void *data, u16 spsp, u8 secp, void *buffer, + size_t len, bool send); #ifdef CONFIG_BLK_SED_OPAL bool opal_unlock_from_suspend(struct opal_dev *dev); -void init_opal_dev(struct opal_dev *opal_dev, sec_send_recv *send_recv); +struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); static inline bool is_sed_ioctl(unsigned int cmd) @@ -168,11 +65,6 @@ static inline bool opal_unlock_from_suspend(struct opal_dev *dev) { return false; } -static inline void init_opal_dev(struct opal_dev *opal_dev, - sec_send_recv *send_recv) -{ - opal_dev->supported = false; - opal_dev->initialized = true; -} +#define init_opal_dev(data, send_recv) NULL #endif /* CONFIG_BLK_SED_OPAL */ #endif /* LINUX_OPAL_H */ -- cgit v1.2.3 From 8a9ae523282f324989850fcf41312b42a2fb9296 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 17 Feb 2017 13:59:40 +0100 Subject: nvme: Check for Security send/recv support before issuing commands. We need to verify that the controller supports the security commands before actually trying to issue them. Signed-off-by: Scott Bauer [hch: moved the check so that we don't call into the OPAL code if not supported] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3d1c6f1b15c9..00eac863a9c7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -244,6 +244,7 @@ enum { NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, + NVME_CTRL_OACS_SEC_SUPP = 1 << 0, }; struct nvme_lbaf { -- cgit v1.2.3
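The check added by the last patch amounts to testing bit 0 of the OACS (Optional Admin Command Support) field reported by Identify Controller before wiring up OPAL support for a controller. A stand-alone illustration of that test follows; the helper name is invented and this is not the driver code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NVME_CTRL_OACS_SEC_SUPP  (1 << 0)   /* same bit the patch tests */

/* Hypothetical helper: decide whether to set up sed-opal for a controller
 * based on the OACS word from Identify Controller. */
static bool ctrl_supports_security(uint16_t oacs)
{
        return oacs & NVME_CTRL_OACS_SEC_SUPP;
}

int main(void)
{
        uint16_t oacs_with = 0x0017, oacs_without = 0x0016;

        printf("oacs=0x%04x -> security send/recv %s\n", oacs_with,
               ctrl_supports_security(oacs_with) ? "supported" : "not supported");
        printf("oacs=0x%04x -> security send/recv %s\n", oacs_without,
               ctrl_supports_security(oacs_without) ? "supported" : "not supported");
        return 0;
}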