From 600335205b8d162891b5ef2e32343f5b8020efd8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 26 Oct 2018 09:53:52 -0600
Subject: ide: convert to blk-mq

ide-disk and ide-cd tested as working just fine, ide-tape and
ide-floppy haven't. But the latter don't require changes, so they
should work without issue.

Add helper function to insert a request from a work queue, since we
cannot invoke the blk-mq request insertion from IRQ context.

Cc: David Miller <davem@davemloft.net>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ide.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index c74b0321922a..079f8bc0b0f4 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -10,7 +10,7 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/ata.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
 #include <linux/bitops.h>
@@ -529,6 +529,10 @@ struct ide_drive_s {
 
 	struct request_queue	*queue;	/* request queue */
 
+	int (*prep_rq)(struct ide_drive_s *, struct request *);
+
+	struct blk_mq_tag_set	tag_set;
+
 	struct request		*rq;	/* current request */
 	void		*driver_data;	/* extra driver data */
 	u16			*id;	/* identification info */
@@ -612,6 +616,10 @@ struct ide_drive_s {
 	bool sense_rq_armed;
 	struct request *sense_rq;
 	struct request_sense sense_data;
+
+	/* async sense insertion */
+	struct work_struct rq_work;
+	struct list_head rq_list;
 };
 
 typedef struct ide_drive_s ide_drive_t;
@@ -1089,6 +1097,7 @@ extern int ide_pci_clk;
 
 int ide_end_rq(ide_drive_t *, struct request *, blk_status_t, unsigned int);
 void ide_kill_rq(ide_drive_t *, struct request *);
+void ide_insert_request_head(ide_drive_t *, struct request *);
 
 void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int);
 void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int);
@@ -1208,7 +1217,7 @@ extern void ide_stall_queue(ide_drive_t *drive, unsigned long timeout);
 
 extern void ide_timer_expiry(struct timer_list *t);
 extern irqreturn_t ide_intr(int irq, void *dev_id);
-extern void do_ide_request(struct request_queue *);
+extern blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
 extern void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq);
 
 void ide_init_disk(struct gendisk *, ide_drive_t *);
-- 
cgit v1.2.3


From 9ba20527f4d1430b5f3e5f566be5af3e156a3284 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 10:15:10 -0600
Subject: blk-mq: provide mq_ops->busy() hook

We'll hook into this from blk_lld_busy(), allowing blk-mq to also
return whether or not a given queue currently has requests in
progress.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2286dc12c6bc..5c8418ebbfd6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -114,6 +114,7 @@ typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
+typedef bool (busy_fn)(struct request_queue *);
 
 
 struct blk_mq_ops {
@@ -165,6 +166,11 @@ struct blk_mq_ops {
 	/* Called from inside blk_get_request() */
 	void (*initialize_rq_fn)(struct request *rq);
 
+	/*
+	 * If set, returns whether or not this queue currently is busy
+	 */
+	busy_fn			*busy;
+
 	map_queues_fn		*map_queues;
 
 #ifdef CONFIG_BLK_DEBUG_FS
-- 
cgit v1.2.3


From c6f2882691e8fd128083abdcc3c5aa5b410c2367 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 10:22:19 -0600
Subject: block: remove q->lld_busy_fn()

Nobody is using the legacy path for blk_lld_busy() anymore, remove
it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4293dc1cd160..e867733b761d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -320,7 +320,6 @@ typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
 struct bio_vec;
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
-typedef int (lld_busy_fn) (struct request_queue *q);
 typedef int (bsg_job_fn) (struct bsg_job *);
 typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
 typedef void (exit_rq_fn)(struct request_queue *, struct request *);
@@ -466,7 +465,6 @@ struct request_queue {
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
-	lld_busy_fn		*lld_busy_fn;
 	/* Called just after a request is allocated */
 	init_rq_fn		*init_rq_fn;
 	/* Called just before a request is freed */
@@ -1255,7 +1253,6 @@ extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
 extern int blk_queue_dma_drain(struct request_queue *q,
 			       dma_drain_needed_fn *dma_drain_needed,
 			       void *buf, unsigned int size);
-extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
-- 
cgit v1.2.3


From aae3b069d5ce865ca5ef2902c2a22cef7ab4f3a2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 26 Oct 2018 11:26:25 -0600
Subject: bsg: pass in desired timeout handler

This will ease in the conversion to blk-mq, where we can't set
a timeout handler after queue init.

Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: linux-scsi@vger.kernel.org
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bsg-lib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 6aeaf6472665..b13ae143e7ef 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -72,7 +72,7 @@ struct bsg_job {
 void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len);
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, int dd_job_size);
+		bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size);
 void bsg_job_put(struct bsg_job *job);
 int __must_check bsg_job_get(struct bsg_job *job);
 
-- 
cgit v1.2.3


From 5e28b8d8a1b03ce86f33d38a64a4983d2b5c7679 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 26 Oct 2018 11:27:02 -0600
Subject: bsg: provide bsg_remove_queue() helper

All drivers do unregister + cleanup, provide a helper for that.

Cc: linux-scsi@vger.kernel.org
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bsg-lib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index b13ae143e7ef..9c9b134b1fa5 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -73,6 +73,7 @@ void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len);
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 		bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size);
+void bsg_remove_queue(struct request_queue *q);
 void bsg_job_put(struct bsg_job *job);
 int __must_check bsg_job_get(struct bsg_job *job);
 
-- 
cgit v1.2.3


From 771a93c489bf486b957c7399f89ee06d43ba2d93 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 22 Oct 2018 05:12:32 -0600
Subject: block: remove blk_complete_request()

It's now unused.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e867733b761d..6baea6563364 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1203,7 +1203,6 @@ extern bool __blk_end_request(struct request *rq, blk_status_t error,
 extern void __blk_end_request_all(struct request *rq, blk_status_t error);
 extern bool __blk_end_request_cur(struct request *rq, blk_status_t error);
 
-extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
 extern void blk_unprep_request(struct request *);
-- 
cgit v1.2.3


From 7ca01926463a15f5d2681458643b2453930b873a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 24 Oct 2018 03:39:36 -0600
Subject: block: remove legacy rq tagging

It's now unused, kill it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 35 -----------------------------------
 1 file changed, 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6baea6563364..8afe3331777e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -85,8 +85,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_SORTED		((__force req_flags_t)(1 << 0))
 /* drive already may have started this one */
 #define RQF_STARTED		((__force req_flags_t)(1 << 1))
-/* uses tagged queueing */
-#define RQF_QUEUED		((__force req_flags_t)(1 << 2))
 /* may not be passed by ioscheduler */
 #define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
 /* request for flush sequence */
@@ -336,15 +334,6 @@ enum blk_queue_state {
 	Queue_up,
 };
 
-struct blk_queue_tag {
-	struct request **tag_index;	/* map of busy tags */
-	unsigned long *tag_map;		/* bit map of free/busy tags */
-	int max_depth;			/* what we will send to device */
-	int real_max_depth;		/* what the array can hold */
-	atomic_t refcnt;		/* map can be shared */
-	int alloc_policy;		/* tag allocation policy */
-	int next_tag;			/* next tag */
-};
 #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
 #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
 
@@ -568,8 +557,6 @@ struct request_queue {
 	unsigned int		dma_pad_mask;
 	unsigned int		dma_alignment;
 
-	struct blk_queue_tag	*queue_tags;
-
 	unsigned int		nr_sorted;
 	unsigned int		in_flight[2];
 
@@ -680,7 +667,6 @@ struct request_queue {
 	u64			write_hints[BLK_MAX_WRITE_HINTS];
 };
 
-#define QUEUE_FLAG_QUEUED	0	/* uses generic tag queueing */
 #define QUEUE_FLAG_STOPPED	1	/* queue is stopped */
 #define QUEUE_FLAG_DYING	2	/* queue being torn down */
 #define QUEUE_FLAG_BYPASS	3	/* act as dumb FIFO queue */
@@ -724,7 +710,6 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
 
-#define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
@@ -1359,26 +1344,6 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 		 !list_empty(&plug->cb_list));
 }
 
-/*
- * tag stuff
- */
-extern int blk_queue_start_tag(struct request_queue *, struct request *);
-extern struct request *blk_queue_find_tag(struct request_queue *, int);
-extern void blk_queue_end_tag(struct request_queue *, struct request *);
-extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int);
-extern void blk_queue_free_tags(struct request_queue *);
-extern int blk_queue_resize_tags(struct request_queue *, int);
-extern struct blk_queue_tag *blk_init_tags(int, int);
-extern void blk_free_tags(struct blk_queue_tag *);
-
-static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
-						int tag)
-{
-	if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
-		return NULL;
-	return bqt->tag_index[tag];
-}
-
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
-- 
cgit v1.2.3


From a1ce35fa49852db60fc6e268038530be533c5b15 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 10:23:51 -0600
Subject: block: remove dead elevator code

This removes a bunch of core and elevator related code. On the core
front, we remove anything related to queue running, draining,
initialization, plugging, and congestions. We also kill anything
related to request allocation, merging, retrieval, and completion.

Remove any checking for single queue IO schedulers, as they no
longer exist. This means we can also delete a bunch of code related
to request issue, adding, completion, etc - and all the SQ related
ops and helpers.

Also kill the load_default_modules(), as all that did was provide
for a way to load the default single queue elevator.

Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h   | 93 ++----------------------------------------------
 include/linux/elevator.h | 90 +---------------------------------------------
 include/linux/init.h     |  1 -
 3 files changed, 3 insertions(+), 181 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8afe3331777e..a9f6db8abcda 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -58,9 +58,6 @@ struct blk_stat_callback;
 
 typedef void (rq_end_io_fn)(struct request *, blk_status_t);
 
-#define BLK_RL_SYNCFULL		(1U << 0)
-#define BLK_RL_ASYNCFULL	(1U << 1)
-
 struct request_list {
 	struct request_queue	*q;	/* the queue this rl belongs to */
 #ifdef CONFIG_BLK_CGROUP
@@ -309,11 +306,8 @@ static inline unsigned short req_get_ioprio(struct request *req)
 
 struct blk_queue_ctx;
 
-typedef void (request_fn_proc) (struct request_queue *q);
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
-typedef int (prep_rq_fn) (struct request_queue *, struct request *);
-typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
 
 struct bio_vec;
 typedef void (softirq_done_fn)(struct request *);
@@ -432,8 +426,6 @@ struct request_queue {
 	struct list_head	queue_head;
 	struct request		*last_merge;
 	struct elevator_queue	*elevator;
-	int			nr_rqs[2];	/* # allocated [a]sync rqs */
-	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
 	struct blk_queue_stats	*stats;
 	struct rq_qos		*rq_qos;
@@ -446,11 +438,8 @@ struct request_queue {
 	 */
 	struct request_list	root_rl;
 
-	request_fn_proc		*request_fn;
 	make_request_fn		*make_request_fn;
 	poll_q_fn		*poll_fn;
-	prep_rq_fn		*prep_rq_fn;
-	unprep_rq_fn		*unprep_rq_fn;
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
@@ -458,8 +447,6 @@ struct request_queue {
 	init_rq_fn		*init_rq_fn;
 	/* Called just before a request is freed */
 	exit_rq_fn		*exit_rq_fn;
-	/* Called from inside blk_get_request() */
-	void (*initialize_rq_fn)(struct request *rq);
 
 	const struct blk_mq_ops	*mq_ops;
 
@@ -475,17 +462,6 @@ struct request_queue {
 	struct blk_mq_hw_ctx	**queue_hw_ctx;
 	unsigned int		nr_hw_queues;
 
-	/*
-	 * Dispatch queue sorting
-	 */
-	sector_t		end_sector;
-	struct request		*boundary_rq;
-
-	/*
-	 * Delayed queue handling
-	 */
-	struct delayed_work	delay_work;
-
 	struct backing_dev_info	*backing_dev_info;
 
 	/*
@@ -548,9 +524,6 @@ struct request_queue {
 	 * queue settings
 	 */
 	unsigned long		nr_requests;	/* Max # of requests */
-	unsigned int		nr_congestion_on;
-	unsigned int		nr_congestion_off;
-	unsigned int		nr_batching;
 
 	unsigned int		dma_drain_size;
 	void			*dma_drain_buffer;
@@ -560,13 +533,6 @@ struct request_queue {
 	unsigned int		nr_sorted;
 	unsigned int		in_flight[2];
 
-	/*
-	 * Number of active block driver functions for which blk_drain_queue()
-	 * must wait. Must be incremented around functions that unlock the
-	 * queue_lock internally, e.g. scsi_request_fn().
-	 */
-	unsigned int		request_fn_active;
-
 	unsigned int		rq_timeout;
 	int			poll_nsec;
 
@@ -740,11 +706,6 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
 
-static inline int queue_in_flight(struct request_queue *q)
-{
-	return q->in_flight[0] + q->in_flight[1];
-}
-
 static inline bool blk_account_rq(struct request *rq)
 {
 	return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq);
@@ -765,7 +726,7 @@ static inline bool blk_account_rq(struct request *rq)
  */
 static inline bool queue_is_rq_based(struct request_queue *q)
 {
-	return q->request_fn || q->mq_ops;
+	return q->mq_ops;
 }
 
 static inline unsigned int blk_queue_cluster(struct request_queue *q)
@@ -828,27 +789,6 @@ static inline bool rq_is_sync(struct request *rq)
 	return op_is_sync(rq->cmd_flags);
 }
 
-static inline bool blk_rl_full(struct request_list *rl, bool sync)
-{
-	unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
-
-	return rl->flags & flag;
-}
-
-static inline void blk_set_rl_full(struct request_list *rl, bool sync)
-{
-	unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
-
-	rl->flags |= flag;
-}
-
-static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
-{
-	unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
-
-	rl->flags &= ~flag;
-}
-
 static inline bool rq_mergeable(struct request *rq)
 {
 	if (blk_rq_is_passthrough(rq))
@@ -969,7 +909,6 @@ extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, unsigned int op,
 				       blk_mq_req_flags_t flags);
-extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 			     struct bio_set *bs, gfp_t gfp_mask,
@@ -979,7 +918,6 @@ extern void blk_rq_unprep_clone(struct request *rq);
 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
-extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_queue_split(struct request_queue *, struct bio **);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
@@ -992,15 +930,7 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
-extern void blk_start_queue(struct request_queue *q);
-extern void blk_start_queue_async(struct request_queue *q);
-extern void blk_stop_queue(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
-extern void __blk_stop_queue(struct request_queue *q);
-extern void __blk_run_queue(struct request_queue *q);
-extern void __blk_run_queue_uncond(struct request_queue *q);
-extern void blk_run_queue(struct request_queue *);
-extern void blk_run_queue_async(struct request_queue *q);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
 			   struct rq_map_data *, void __user *, unsigned long,
 			   gfp_t);
@@ -1155,13 +1085,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq)
 	return nr_bios;
 }
 
-/*
- * Request issue related functions.
- */
-extern struct request *blk_peek_request(struct request_queue *q);
-extern void blk_start_request(struct request *rq);
-extern struct request *blk_fetch_request(struct request_queue *q);
-
 void blk_steal_bios(struct bio_list *list, struct request *rq);
 
 /*
@@ -1179,9 +1102,6 @@ void blk_steal_bios(struct bio_list *list, struct request *rq);
  */
 extern bool blk_update_request(struct request *rq, blk_status_t error,
 			       unsigned int nr_bytes);
-extern void blk_finish_request(struct request *rq, blk_status_t error);
-extern bool blk_end_request(struct request *rq, blk_status_t error,
-			    unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, blk_status_t error);
 extern bool __blk_end_request(struct request *rq, blk_status_t error,
 			      unsigned int nr_bytes);
@@ -1190,15 +1110,10 @@ extern bool __blk_end_request_cur(struct request *rq, blk_status_t error);
 
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
-extern void blk_unprep_request(struct request *);
 
 /*
  * Access functions for manipulating queue properties
  */
-extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
-					spinlock_t *lock, int node_id);
-extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
-extern int blk_init_allocated_queue(struct request_queue *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
@@ -1239,8 +1154,6 @@ extern int blk_queue_dma_drain(struct request_queue *q,
 			       void *buf, unsigned int size);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
-extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
-extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
@@ -1298,7 +1211,6 @@ extern void blk_set_queue_dying(struct request_queue *);
  * schedule() where blk_schedule_flush_plug() is called.
  */
 struct blk_plug {
-	struct list_head list; /* requests */
 	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
 };
@@ -1339,8 +1251,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 	struct blk_plug *plug = tsk->plug;
 
 	return plug &&
-		(!list_empty(&plug->list) ||
-		 !list_empty(&plug->mq_list) ||
+		 (!list_empty(&plug->mq_list) ||
 		 !list_empty(&plug->cb_list));
 }
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 015bb59c0331..158004f1754d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -23,74 +23,6 @@ enum elv_merge {
 	ELEVATOR_DISCARD_MERGE	= 3,
 };
 
-typedef enum elv_merge (elevator_merge_fn) (struct request_queue *, struct request **,
-				 struct bio *);
-
-typedef void (elevator_merge_req_fn) (struct request_queue *, struct request *, struct request *);
-
-typedef void (elevator_merged_fn) (struct request_queue *, struct request *, enum elv_merge);
-
-typedef int (elevator_allow_bio_merge_fn) (struct request_queue *,
-					   struct request *, struct bio *);
-
-typedef int (elevator_allow_rq_merge_fn) (struct request_queue *,
-					  struct request *, struct request *);
-
-typedef void (elevator_bio_merged_fn) (struct request_queue *,
-						struct request *, struct bio *);
-
-typedef int (elevator_dispatch_fn) (struct request_queue *, int);
-
-typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
-typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
-typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
-typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int);
-
-typedef void (elevator_init_icq_fn) (struct io_cq *);
-typedef void (elevator_exit_icq_fn) (struct io_cq *);
-typedef int (elevator_set_req_fn) (struct request_queue *, struct request *,
-				   struct bio *, gfp_t);
-typedef void (elevator_put_req_fn) (struct request *);
-typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
-typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
-
-typedef int (elevator_init_fn) (struct request_queue *,
-				struct elevator_type *e);
-typedef void (elevator_exit_fn) (struct elevator_queue *);
-typedef void (elevator_registered_fn) (struct request_queue *);
-
-struct elevator_ops
-{
-	elevator_merge_fn *elevator_merge_fn;
-	elevator_merged_fn *elevator_merged_fn;
-	elevator_merge_req_fn *elevator_merge_req_fn;
-	elevator_allow_bio_merge_fn *elevator_allow_bio_merge_fn;
-	elevator_allow_rq_merge_fn *elevator_allow_rq_merge_fn;
-	elevator_bio_merged_fn *elevator_bio_merged_fn;
-
-	elevator_dispatch_fn *elevator_dispatch_fn;
-	elevator_add_req_fn *elevator_add_req_fn;
-	elevator_activate_req_fn *elevator_activate_req_fn;
-	elevator_deactivate_req_fn *elevator_deactivate_req_fn;
-
-	elevator_completed_req_fn *elevator_completed_req_fn;
-
-	elevator_request_list_fn *elevator_former_req_fn;
-	elevator_request_list_fn *elevator_latter_req_fn;
-
-	elevator_init_icq_fn *elevator_init_icq_fn;	/* see iocontext.h */
-	elevator_exit_icq_fn *elevator_exit_icq_fn;	/* ditto */
-
-	elevator_set_req_fn *elevator_set_req_fn;
-	elevator_put_req_fn *elevator_put_req_fn;
-
-	elevator_may_queue_fn *elevator_may_queue_fn;
-
-	elevator_init_fn *elevator_init_fn;
-	elevator_exit_fn *elevator_exit_fn;
-	elevator_registered_fn *elevator_registered_fn;
-};
-
 struct blk_mq_alloc_data;
 struct blk_mq_hw_ctx;
 
@@ -138,16 +70,15 @@ struct elevator_type
 
 	/* fields provided by elevator implementation */
 	union {
-		struct elevator_ops sq;
 		struct elevator_mq_ops mq;
 	} ops;
+
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	const char *elevator_alias;
 	struct module *elevator_owner;
-	bool uses_mq;
 #ifdef CONFIG_BLK_DEBUG_FS
 	const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
 	const struct blk_mq_debugfs_attr *hctx_debugfs_attrs;
@@ -175,40 +106,25 @@ struct elevator_queue
 	struct kobject kobj;
 	struct mutex sysfs_lock;
 	unsigned int registered:1;
-	unsigned int uses_mq:1;
 	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
 };
 
 /*
  * block elevator interface
  */
-extern void elv_dispatch_sort(struct request_queue *, struct request *);
-extern void elv_dispatch_add_tail(struct request_queue *, struct request *);
-extern void elv_add_request(struct request_queue *, struct request *, int);
-extern void __elv_add_request(struct request_queue *, struct request *, int);
 extern enum elv_merge elv_merge(struct request_queue *, struct request **,
 		struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *,
 		enum elv_merge);
-extern void elv_bio_merged(struct request_queue *q, struct request *,
-				struct bio *);
 extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
-extern void elv_requeue_request(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
-extern int elv_may_queue(struct request_queue *, unsigned int);
-extern void elv_completed_request(struct request_queue *, struct request *);
-extern int elv_set_request(struct request_queue *q, struct request *rq,
-			   struct bio *bio, gfp_t gfp_mask);
-extern void elv_put_request(struct request_queue *, struct request *);
-extern void elv_drain_elevator(struct request_queue *);
 
 /*
  * io scheduler registration
  */
-extern void __init load_default_elevator_module(void);
 extern int elv_register(struct elevator_type *);
 extern void elv_unregister(struct elevator_type *);
 
@@ -260,9 +176,5 @@ enum {
 #define rq_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
 #define rq_fifo_clear(rq)	list_del_init(&(rq)->queuelist)
 
-#else /* CONFIG_BLOCK */
-
-static inline void load_default_elevator_module(void) { }
-
 #endif /* CONFIG_BLOCK */
 #endif
diff --git a/include/linux/init.h b/include/linux/init.h
index 9c2aba1dbabf..5255069f5a9f 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -146,7 +146,6 @@ extern unsigned int reset_devices;
 /* used by init/main.c */
 void setup_arch(char **);
 void prepare_namespace(void);
-void __init load_default_modules(void);
 int __init init_rootfs(void);
 
 #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
-- 
cgit v1.2.3


From f9cd4bfe96955e7a1d3ec54b393dee87b815ba3b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 1 Nov 2018 16:41:41 -0600
Subject: block: get rid of MQ scheduler ops union

This is a remnant of when we had ops for both SQ and MQ
schedulers. Now it's just MQ, so get rid of the union.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/elevator.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 158004f1754d..2e9e2763bf47 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -69,9 +69,7 @@ struct elevator_type
 	struct kmem_cache *icq_cache;
 
 	/* fields provided by elevator implementation */
-	union {
-		struct elevator_mq_ops mq;
-	} ops;
+	struct elevator_mq_ops ops;
 
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
-- 
cgit v1.2.3


From 92bc5a24844ada9b010f03c49a493e3edeadaa54 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 24 Oct 2018 13:52:28 -0600
Subject: block: remove __blk_put_request()

Now there's no difference between blk_put_request() and
__blk_put_request() anymore, get rid of the underscore version and
convert the few callers.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a9f6db8abcda..c502a7f40e84 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -906,7 +906,6 @@ extern blk_qc_t direct_make_request(struct bio *bio);
 extern void blk_rq_init(struct request_queue *q, struct request *rq);
 extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
 extern void blk_put_request(struct request *);
-extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, unsigned int op,
 				       blk_mq_req_flags_t flags);
 extern int blk_lld_busy(struct request_queue *q);
-- 
cgit v1.2.3


From 4316b79e4321d4140164e42f228778e5bc66c84f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 10:25:07 -0600
Subject: block: kill legacy parts of timeout handling

The only user of legacy timing now is BSG, which is invoked
from the mq timeout handler. Kill the legacy code, and rename
the q->rq_timed_out_fn to q->bsg_job_timeout_fn.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c502a7f40e84..0364fc53f5c8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -441,7 +441,6 @@ struct request_queue {
 	make_request_fn		*make_request_fn;
 	poll_q_fn		*poll_fn;
 	softirq_done_fn		*softirq_done_fn;
-	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 	/* Called just after a request is allocated */
 	init_rq_fn		*init_rq_fn;
@@ -541,7 +540,6 @@ struct request_queue {
 
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
-	struct list_head	timeout_list;
 
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
@@ -601,6 +599,7 @@ struct request_queue {
 
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
+	rq_timed_out_fn		*bsg_job_timeout_fn;
 	struct bsg_class_device bsg_dev;
 #endif
 
@@ -1156,7 +1155,6 @@ extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
-extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
-- 
cgit v1.2.3


From 1028e4b335665290dc563d5272f3c6b84e7fd66e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 09:47:17 -0600
Subject: bsg: move bsg-lib parts outside of request queue

Get rid of the special bsg job fn and timeout handler, move them
into a private bsg_set instead.

Mostly from Christoph, with fixes for error handling and cleanups.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h  | 5 -----
 include/linux/bsg-lib.h | 5 ++++-
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0364fc53f5c8..877a3d235c45 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -312,7 +312,6 @@ typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
 struct bio_vec;
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
-typedef int (bsg_job_fn) (struct bsg_job *);
 typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
 typedef void (exit_rq_fn)(struct request_queue *, struct request *);
 
@@ -321,8 +320,6 @@ enum blk_eh_timer_return {
 	BLK_EH_RESET_TIMER,	/* reset timer and try again */
 };
 
-typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *);
-
 enum blk_queue_state {
 	Queue_down,
 	Queue_up,
@@ -598,8 +595,6 @@ struct request_queue {
 	atomic_t		mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
-	bsg_job_fn		*bsg_job_fn;
-	rq_timed_out_fn		*bsg_job_timeout_fn;
 	struct bsg_class_device bsg_dev;
 #endif
 
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 9c9b134b1fa5..b356e0006731 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -31,6 +31,9 @@ struct device;
 struct scatterlist;
 struct request_queue;
 
+typedef int (bsg_job_fn) (struct bsg_job *);
+typedef enum blk_eh_timer_return (bsg_timeout_fn)(struct request *);
+
 struct bsg_buffer {
 	unsigned int payload_len;
 	int sg_cnt;
@@ -72,7 +75,7 @@ struct bsg_job {
 void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len);
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size);
+		bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size);
 void bsg_remove_queue(struct request_queue *q);
 void bsg_job_put(struct bsg_job *job);
 int __must_check bsg_job_get(struct bsg_job *job);
-- 
cgit v1.2.3


From db6d995235606191fa9db0c717e9d843200b71ea Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 2 Nov 2018 08:46:15 -0600
Subject: block: remove request_list code

It's now dead code, nobody uses it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 97 ----------------------------------------------
 include/linux/blkdev.h     | 34 ----------------
 2 files changed, 131 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 6d766a19f2bb..1b299e025e83 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -122,9 +122,6 @@ struct blkcg_gq {
 	/* all non-root blkcg_gq's are guaranteed to have access to parent */
 	struct blkcg_gq			*parent;
 
-	/* request allocation list for this blkcg-q pair */
-	struct request_list		rl;
-
 	/* reference count */
 	atomic_t			refcnt;
 
@@ -515,94 +512,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
 					      (p_blkg)->q, false)))
 
-/**
- * blk_get_rl - get request_list to use
- * @q: request_queue of interest
- * @bio: bio which will be attached to the allocated request (may be %NULL)
- *
- * The caller wants to allocate a request from @q to use for @bio.  Find
- * the request_list to use and obtain a reference on it.  Should be called
- * under queue_lock.  This function is guaranteed to return non-%NULL
- * request_list.
- */
-static inline struct request_list *blk_get_rl(struct request_queue *q,
-					      struct bio *bio)
-{
-	struct blkcg *blkcg;
-	struct blkcg_gq *blkg;
-
-	rcu_read_lock();
-
-	blkcg = bio_blkcg(bio);
-
-	/* bypass blkg lookup and use @q->root_rl directly for root */
-	if (blkcg == &blkcg_root)
-		goto root_rl;
-
-	/*
-	 * Try to use blkg->rl.  blkg lookup may fail under memory pressure
-	 * or if either the blkcg or queue is going away.  Fall back to
-	 * root_rl in such cases.
-	 */
-	blkg = blkg_lookup(blkcg, q);
-	if (unlikely(!blkg))
-		goto root_rl;
-
-	blkg_get(blkg);
-	rcu_read_unlock();
-	return &blkg->rl;
-root_rl:
-	rcu_read_unlock();
-	return &q->root_rl;
-}
-
-/**
- * blk_put_rl - put request_list
- * @rl: request_list to put
- *
- * Put the reference acquired by blk_get_rl().  Should be called under
- * queue_lock.
- */
-static inline void blk_put_rl(struct request_list *rl)
-{
-	if (rl->blkg->blkcg != &blkcg_root)
-		blkg_put(rl->blkg);
-}
-
-/**
- * blk_rq_set_rl - associate a request with a request_list
- * @rq: request of interest
- * @rl: target request_list
- *
- * Associate @rq with @rl so that accounting and freeing can know the
- * request_list @rq came from.
- */
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
-{
-	rq->rl = rl;
-}
-
-/**
- * blk_rq_rl - return the request_list a request came from
- * @rq: request of interest
- *
- * Return the request_list @rq is allocated from.
- */
-static inline struct request_list *blk_rq_rl(struct request *rq)
-{
-	return rq->rl;
-}
-
-struct request_list *__blk_queue_next_rl(struct request_list *rl,
-					 struct request_queue *q);
-/**
- * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
- *
- * Should be used under queue_lock.
- */
-#define blk_queue_for_each_rl(rl, q)	\
-	for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
-
 static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
 {
 	int ret;
@@ -939,12 +848,6 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 
-static inline struct request_list *blk_get_rl(struct request_queue *q,
-					      struct bio *bio) { return &q->root_rl; }
-static inline void blk_put_rl(struct request_list *rl) { }
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
-static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
-
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio) { return true; }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 877a3d235c45..e0c661a95c39 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -58,22 +58,6 @@ struct blk_stat_callback;
 
 typedef void (rq_end_io_fn)(struct request *, blk_status_t);
 
-struct request_list {
-	struct request_queue	*q;	/* the queue this rl belongs to */
-#ifdef CONFIG_BLK_CGROUP
-	struct blkcg_gq		*blkg;	/* blkg this request pool belongs to */
-#endif
-	/*
-	 * count[], starved[], and wait[] are indexed by
-	 * BLK_RW_SYNC/BLK_RW_ASYNC
-	 */
-	int			count[2];
-	int			starved[2];
-	mempool_t		*rq_pool;
-	wait_queue_head_t	wait[2];
-	unsigned int		flags;
-};
-
 /*
  * request flags */
 typedef __u32 __bitwise req_flags_t;
@@ -259,10 +243,6 @@ struct request {
 
 	/* for bidi */
 	struct request *next_rq;
-
-#ifdef CONFIG_BLK_CGROUP
-	struct request_list *rl;		/* rl this rq is alloced from */
-#endif
 };
 
 static inline bool blk_op_is_scsi(unsigned int op)
@@ -312,8 +292,6 @@ typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
 struct bio_vec;
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
-typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
-typedef void (exit_rq_fn)(struct request_queue *, struct request *);
 
 enum blk_eh_timer_return {
 	BLK_EH_DONE,		/* drivers has completed the command */
@@ -427,22 +405,10 @@ struct request_queue {
 	struct blk_queue_stats	*stats;
 	struct rq_qos		*rq_qos;
 
-	/*
-	 * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
-	 * is used, root blkg allocates from @q->root_rl and all other
-	 * blkgs from their own blkg->rl.  Which one to use should be
-	 * determined using bio_request_list().
-	 */
-	struct request_list	root_rl;
-
 	make_request_fn		*make_request_fn;
 	poll_q_fn		*poll_fn;
 	softirq_done_fn		*softirq_done_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
-	/* Called just after a request is allocated */
-	init_rq_fn		*init_rq_fn;
-	/* Called just before a request is freed */
-	exit_rq_fn		*exit_rq_fn;
 
 	const struct blk_mq_ops	*mq_ops;
 
-- 
cgit v1.2.3


From 7d692330e7cd581ccfee982334bf06b236cb999a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 24 Oct 2018 10:48:12 -0600
Subject: block: get rid of blk_queued_rq()

No point in hiding what this does, just open code it in the
one spot where we are still using it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e0c661a95c39..c675e2b5af62 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -673,8 +673,6 @@ static inline bool blk_account_rq(struct request *rq)
 
 #define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
-/* rq->queuelist of dequeued request must be list_empty() */
-#define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
-- 
cgit v1.2.3


From c7bb9ad1744ea14e61e5fff99ee5282709b0c9d9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 31 Oct 2018 09:43:30 -0600
Subject: block: get rid of q->softirq_done_fn()

With the legacy path gone, all we do is funnel it through the
mq_ops->complete() operation.

Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 3 ++-
 include/linux/blkdev.h | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5c8418ebbfd6..9dd574e5436a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -115,6 +115,7 @@ typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
 typedef bool (busy_fn)(struct request_queue *);
+typedef void (complete_fn)(struct request *);
 
 
 struct blk_mq_ops {
@@ -142,7 +143,7 @@ struct blk_mq_ops {
 	 */
 	poll_fn			*poll;
 
-	softirq_done_fn		*complete;
+	complete_fn		*complete;
 
 	/*
 	 * Called when the block layer side of a hardware queue has been
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c675e2b5af62..d4104844d6bb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -290,7 +290,6 @@ typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
 
 struct bio_vec;
-typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 
 enum blk_eh_timer_return {
@@ -407,7 +406,6 @@ struct request_queue {
 
 	make_request_fn		*make_request_fn;
 	poll_q_fn		*poll_fn;
-	softirq_done_fn		*softirq_done_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 
 	const struct blk_mq_ops	*mq_ops;
@@ -1113,7 +1111,6 @@ extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
-extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
-- 
cgit v1.2.3


From 9cf2bab6307659b940da65d16dcc8f82c69f3a97 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 31 Oct 2018 17:01:22 -0600
Subject: block: kill request ->cpu member

This was used for completion placement for the legacy path,
but for mq we have rq->mq_ctx->cpu for that. Add a helper
to get the request CPU assignment, as the mq_ctx type is
private to blk-mq.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 ++
 include/linux/blkdev.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9dd574e5436a..d83a26fb37e5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -300,6 +300,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
 
+unsigned int blk_mq_rq_cpu(struct request *rq);
+
 /**
  * blk_mq_mark_complete() - Set request state to complete
  * @rq: request to set to complete state
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d4104844d6bb..c8fa4d3d7fee 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -130,7 +130,6 @@ struct request {
 	struct request_queue *q;
 	struct blk_mq_ctx *mq_ctx;
 
-	int cpu;
 	unsigned int cmd_flags;		/* op and common flags */
 	req_flags_t rq_flags;
 
@@ -669,7 +668,6 @@ static inline bool blk_account_rq(struct request *rq)
 	return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq);
 }
 
-#define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
-- 
cgit v1.2.3


From a8908939af569ce2419f43fd56eeaf003bc3d85d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Oct 2018 14:23:06 -0600
Subject: blk-mq: kill q->mq_map

It's just a pointer to set->mq_map, use that instead. Move the
assignment a bit earlier, so we always know it's valid.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c8fa4d3d7fee..2ae7465d68ab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -409,8 +409,6 @@ struct request_queue {
 
 	const struct blk_mq_ops	*mq_ops;
 
-	unsigned int		*mq_map;
-
 	/* sw queues */
 	struct blk_mq_ctx __percpu	*queue_ctx;
 	unsigned int		nr_queues;
-- 
cgit v1.2.3


From ed76e329d74a4b15ac0f5fd3adbd52ec0178a134 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 13:06:14 -0600
Subject: blk-mq: abstract out queue map

This is in preparation for allowing multiple sets of maps per
queue, if so desired.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq-pci.h    |  4 ++--
 include/linux/blk-mq-virtio.h |  4 ++--
 include/linux/blk-mq.h        | 15 ++++++++++++---
 3 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h
index 9f4c17f0d2d8..0b1f45c62623 100644
--- a/include/linux/blk-mq-pci.h
+++ b/include/linux/blk-mq-pci.h
@@ -2,10 +2,10 @@
 #ifndef _LINUX_BLK_MQ_PCI_H
 #define _LINUX_BLK_MQ_PCI_H
 
-struct blk_mq_tag_set;
+struct blk_mq_queue_map;
 struct pci_dev;
 
-int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
+int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
 			  int offset);
 
 #endif /* _LINUX_BLK_MQ_PCI_H */
diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h
index 69b4da262c45..687ae287e1dc 100644
--- a/include/linux/blk-mq-virtio.h
+++ b/include/linux/blk-mq-virtio.h
@@ -2,10 +2,10 @@
 #ifndef _LINUX_BLK_MQ_VIRTIO_H
 #define _LINUX_BLK_MQ_VIRTIO_H
 
-struct blk_mq_tag_set;
+struct blk_mq_queue_map;
 struct virtio_device;
 
-int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set,
+int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
 		struct virtio_device *vdev, int first_vec);
 
 #endif /* _LINUX_BLK_MQ_VIRTIO_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d83a26fb37e5..176164888628 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -74,10 +74,19 @@ struct blk_mq_hw_ctx {
 	struct srcu_struct	srcu[0];
 };
 
+struct blk_mq_queue_map {
+	unsigned int *mq_map;
+	unsigned int nr_queues;
+};
+
+enum {
+	HCTX_MAX_TYPES = 1,
+};
+
 struct blk_mq_tag_set {
-	unsigned int		*mq_map;
+	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
 	const struct blk_mq_ops	*ops;
-	unsigned int		nr_hw_queues;
+	unsigned int		nr_hw_queues;	/* nr hw queues across maps */
 	unsigned int		queue_depth;	/* max hw supported */
 	unsigned int		reserved_tags;
 	unsigned int		cmd_size;	/* per-request extra data */
@@ -295,7 +304,7 @@ void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout);
 
-int blk_mq_map_queues(struct blk_mq_tag_set *set);
+int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
-- 
cgit v1.2.3


From f31967f0e455d08d3ea1d2f849bf62dafc92dbf4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 13:13:29 -0600
Subject: blk-mq: allow software queue to map to multiple hardware queues

The mapping used to be dependent on just the CPU location, but
now it's a tuple of (type, cpu) instead. This is a prep patch
for allowing a single software queue to map to multiple hardware
queues. No functional changes in this patch.

This changes the software queue count to an unsigned short
to save a bit of space. We can still support 64K-1 CPUs,
which should be enough. Add a check to catch a wrap.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 176164888628..6c39d546c50b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -37,7 +37,8 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	*dispatch_from;
 	unsigned int		dispatch_busy;
 
-	unsigned int		nr_ctx;
+	unsigned short		type;
+	unsigned short		nr_ctx;
 	struct blk_mq_ctx	**ctxs;
 
 	spinlock_t		dispatch_wait_lock;
-- 
cgit v1.2.3


From b3c661b15d5ab11d982e58bee23e05c1780528a1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 30 Oct 2018 10:36:06 -0600
Subject: blk-mq: support multiple hctx maps

Add support for the tag set carrying multiple queue maps, and
for the driver to inform blk-mq how many it wishes to support
through setting set->nr_maps.

This adds an mq_ops helper for drivers that support more than 1
map, mq_ops->rq_flags_to_type(). The function takes request/bio
flags and CPU, and returns a queue map index for that. We then
use the type information in blk_mq_map_queue() to index the map
set.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 6c39d546c50b..8994c95056a8 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -85,7 +85,14 @@ enum {
 };
 
 struct blk_mq_tag_set {
+	/*
+	 * map[] holds ctx -> hctx mappings, one map exists for each type
+	 * that the driver wishes to support. There are no restrictions
+	 * on maps being of the same size, and it's perfectly legal to
+	 * share maps between types.
+	 */
 	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
+	unsigned int		nr_maps;	/* nr entries in map[] */
 	const struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;	/* nr hw queues across maps */
 	unsigned int		queue_depth;	/* max hw supported */
@@ -109,6 +116,8 @@ struct blk_mq_queue_data {
 
 typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
 		const struct blk_mq_queue_data *);
+/* takes rq->cmd_flags as input, returns a hardware type index */
+typedef int (rq_flags_to_type_fn)(struct request_queue *, unsigned int);
 typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
 typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
@@ -134,6 +143,11 @@ struct blk_mq_ops {
 	 */
 	queue_rq_fn		*queue_rq;
 
+	/*
+	 * Return a queue map type for the given request/bio flags
+	 */
+	rq_flags_to_type_fn	*rq_flags_to_type;
+
 	/*
 	 * Reserve budget before queue request, once .queue_rq is
 	 * run, it is driver's responsibility to release the
-- 
cgit v1.2.3


From ea4f995ee8b8f0578b3319949f2edd5d812fdb0a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 15:06:13 -0600
Subject: blk-mq: cache request hardware queue mapping

We call blk_mq_map_queue() a lot, at least two times for each
request per IO, sometimes more. Since we now have an indirect
call as well in that function. cache the mapping so we don't
have to re-call blk_mq_map_queue() for the same request
multiple times.

Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2ae7465d68ab..9b1f470cc784 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -129,6 +129,7 @@ enum mq_rq_state {
 struct request {
 	struct request_queue *q;
 	struct blk_mq_ctx *mq_ctx;
+	struct blk_mq_hw_ctx *mq_hctx;
 
 	unsigned int cmd_flags;		/* op and common flags */
 	req_flags_t rq_flags;
-- 
cgit v1.2.3


From 843477d4cc5c4bb4e346c561ecd3b9d0bd67e8c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 24 Oct 2018 13:16:11 -0600
Subject: blk-mq: initial support for multiple queue maps

Add a queue offset to the tag map. This enables users to map
iteratively, for each queue map type they support.

Bump maximum number of supported maps to 2, we're now fully
able to support more than 1 map.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8994c95056a8..729ce0f00433 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -78,10 +78,11 @@ struct blk_mq_hw_ctx {
 struct blk_mq_queue_map {
 	unsigned int *mq_map;
 	unsigned int nr_queues;
+	unsigned int queue_offset;
 };
 
 enum {
-	HCTX_MAX_TYPES = 1,
+	HCTX_MAX_TYPES = 2,
 };
 
 struct blk_mq_tag_set {
-- 
cgit v1.2.3


From d1e36282b0bbd5de6a9c4d5275e94ef3b3438f48 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 29 Aug 2018 10:36:56 -0600
Subject: block: add REQ_HIPRI and inherit it from IOCB_HIPRI

We use IOCB_HIPRI to poll for IO in the caller instead of scheduling.
This information is not available for (or after) IO submission. The
driver may make different queue choices based on the type of IO, so
make the fact that we will poll for this IO known to the lower layers
as well.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1dcf652ba0aa..dbdbfbd6a987 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -323,6 +323,8 @@ enum req_flag_bits {
 	/* command specific flags for REQ_OP_WRITE_ZEROES: */
 	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
 
+	__REQ_HIPRI,
+
 	/* for driver use */
 	__REQ_DRV,
 	__REQ_SWAP,		/* swapping request. */
@@ -343,8 +345,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT		(1ULL << __REQ_NOWAIT)
-
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
+#define REQ_HIPRI		(1ULL << __REQ_HIPRI)
 
 #define REQ_DRV			(1ULL << __REQ_DRV)
 #define REQ_SWAP		(1ULL << __REQ_SWAP)
-- 
cgit v1.2.3


From 4b04cc6a8f86c4842314def22332de1f15de8523 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 5 Nov 2018 12:44:33 -0700
Subject: nvme: add separate poll queue map

Adds support for defining a variable number of poll queues, currently
configurable with the 'poll_queues' module parameter. Defaults to
a single poll queue.

And now we finally have poll support without triggering interrupts!

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 729ce0f00433..9f5e93f40857 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -82,7 +82,7 @@ struct blk_mq_queue_map {
 };
 
 enum {
-	HCTX_MAX_TYPES = 2,
+	HCTX_MAX_TYPES = 3,
 };
 
 struct blk_mq_tag_set {
-- 
cgit v1.2.3


From 7baa85727d0406ffd2b2303cd803a145aa35c505 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 8 Nov 2018 10:24:07 -0700
Subject: blk-mq-tag: change busy_iter_fn to return whether to continue or not

We have this functionality in sbitmap, but we don't export it in
blk-mq for users of the tags busy iteration. This can be useful
for stopping the iteration, if the caller doesn't need to find
more requests.

Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9f5e93f40857..ff497dfcbbf9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -129,9 +129,9 @@ typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *,
 typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *,
 		unsigned int);
 
-typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
+typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
-typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
+typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
 typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
 typedef bool (busy_fn)(struct request_queue *);
-- 
cgit v1.2.3


From ae8799125d565c798e49dcab4bf182dbfc483524 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 8 Nov 2018 09:03:51 -0700
Subject: blk-mq: provide a helper to check if a queue is busy

Returns true if the queue currently has requests pending,
false if not.

DM can use this to replace the atomic_inc/dec they do per device
to see if a device is busy.

Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ff497dfcbbf9..929e8abc5535 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -250,6 +250,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
+bool blk_mq_queue_busy(struct request_queue *q);
+
 enum {
 	/* return when out of requests */
 	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
-- 
cgit v1.2.3


From 9d037ad707ed6069fbea4e38e6ee37e027b13f1d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 19:37:44 +0100
Subject: block: remove req->timeout_list

Unused now that the legacy request path is gone.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9b1f470cc784..dc2a6f625ecb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -228,8 +228,6 @@ struct request {
 	/* access through blk_rq_set_deadline, blk_rq_deadline */
 	unsigned long __deadline;
 
-	struct list_head timeout_list;
-
 	union {
 		struct __call_single_data csd;
 		u64 fifo_time;
-- 
cgit v1.2.3


From 535ac5d3fe63b9ea1dda379f606f9d0d377d7184 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:42:35 +0100
Subject: ide: cleanup ->prep_rq calling convention

The return value is just used as a binary yes/no decision, so switch
it to a bool instead of the old BLKPREP_* values returned as an int.

Also clean up a few related comments.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ide.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index 079f8bc0b0f4..272704ff21ee 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -529,7 +529,7 @@ struct ide_drive_s {
 
 	struct request_queue	*queue;	/* request queue */
 
-	int (*prep_rq)(struct ide_drive_s *, struct request *);
+	bool (*prep_rq)(struct ide_drive_s *, struct request *);
 
 	struct blk_mq_tag_set	tag_set;
 
-- 
cgit v1.2.3


From 0e17e06cbf7ede285ab74bab44d888b40c21f828 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:42:41 +0100
Subject: block: remove the BLKPREP_* values.

Unused now.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dc2a6f625ecb..e67ad2dd025e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -776,16 +776,6 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
 	return q->nr_requests;
 }
 
-/*
- * q->prep_rq_fn return values
- */
-enum {
-	BLKPREP_OK,		/* serve it */
-	BLKPREP_KILL,		/* fatal error, kill, return -EIO */
-	BLKPREP_DEFER,		/* leave on queue */
-	BLKPREP_INVALID,	/* invalid command, kill, return -EREMOTEIO */
-};
-
 extern unsigned long blk_max_low_pfn, blk_max_pfn;
 
 /*
-- 
cgit v1.2.3


From 22ce0a7ccf23d55d1fdaa2974002f8b5ae765665 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Nov 2018 09:30:49 +0100
Subject: ide: don't use req->special

Just replace it with a field of the same name in struct ide_req.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ide.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index 272704ff21ee..e7d29ae633cd 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -50,6 +50,7 @@ struct ide_request {
 	struct scsi_request sreq;
 	u8 sense[SCSI_SENSE_BUFFERSIZE];
 	u8 type;
+	void *special;
 };
 
 static inline struct ide_request *ide_req(struct request *rq)
-- 
cgit v1.2.3


From 7ff4f8035695984c513598e2d49c8277d5d234ca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 14 Nov 2018 15:22:49 -0700
Subject: block: remove dead queue members

No more users of ->in_flight[] or ->nr_sorted, get rid of them.

Fixes: a1ce35fa4985 ("block: remove dead elevator code")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e67ad2dd025e..c961329be96b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -486,9 +486,6 @@ struct request_queue {
 	unsigned int		dma_pad_mask;
 	unsigned int		dma_alignment;
 
-	unsigned int		nr_sorted;
-	unsigned int		in_flight[2];
-
 	unsigned int		rq_timeout;
 	int			poll_nsec;
 
-- 
cgit v1.2.3


From 8f4236d9008b0973a8281256ccfde6913cdec6cb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:04 +0100
Subject: block: remove QUEUE_FLAG_BYPASS and ->bypass

Unused since the removal of the legacy request code.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 6 +-----
 include/linux/blkdev.h     | 3 ---
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 1b299e025e83..2c68efc603bd 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -325,16 +325,12 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
  * @q: request_queue of interest
  *
  * Lookup blkg for the @blkcg - @q pair.  This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
+ * under RCU read loc.
  */
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
 					   struct request_queue *q)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	if (unlikely(blk_queue_bypass(q)))
-		return NULL;
 	return __blkg_lookup(blkcg, q, false);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c961329be96b..dd1e53fd4acf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -548,7 +548,6 @@ struct request_queue {
 
 	struct mutex		sysfs_lock;
 
-	int			bypass_depth;
 	atomic_t		mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
@@ -586,7 +585,6 @@ struct request_queue {
 
 #define QUEUE_FLAG_STOPPED	1	/* queue is stopped */
 #define QUEUE_FLAG_DYING	2	/* queue being torn down */
-#define QUEUE_FLAG_BYPASS	3	/* act as dumb FIFO queue */
 #define QUEUE_FLAG_BIDI		4	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES     5	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	6	/* complete on same CPU-group */
@@ -630,7 +628,6 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
-#define blk_queue_bypass(q)	test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
 #define blk_queue_init_done(q)	test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)	\
-- 
cgit v1.2.3


From 079076b3416e78ba2bb3ce38e05e320c388c3120 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:05 +0100
Subject: block: remove deadline __deadline manipulation helpers

No users left since the removal of the legacy request interface, we can
remove all the magic bit stealing now and make it a normal field.

But use WRITE_ONCE/READ_ONCE on the new deadline field, given that we
don't seem to have any mechanism to guarantee a new value actually
gets seen by other threads.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dd1e53fd4acf..60507ab7b358 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -224,9 +224,7 @@ struct request {
 	refcount_t ref;
 
 	unsigned int timeout;
-
-	/* access through blk_rq_set_deadline, blk_rq_deadline */
-	unsigned long __deadline;
+	unsigned long deadline;
 
 	union {
 		struct __call_single_data csd;
-- 
cgit v1.2.3


From 57d74df90783f6a6b3e79dfdd2a567ce5db3b790 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:07 +0100
Subject: block: use atomic bitops for ->queue_flags

->queue_flags is generally not set or cleared in the fast path, and also
generally set or cleared one flag at a time.  Make use of the normal
atomic bitops for it so that we don't need to take the queue_lock,
which is otherwise mostly unused in the core block layer now.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 60507ab7b358..30d8e0fbd104 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -621,7 +621,6 @@ struct request_queue {
 void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
-bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
 
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
-- 
cgit v1.2.3


From 6d46964230d182c4b6097379738849a809d791dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:18 +0100
Subject: block: remove the lock argument to blk_alloc_queue_node

With the legacy request path gone there is no real need to override the
queue_lock.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 30d8e0fbd104..c4a3a660e3f0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1122,8 +1122,7 @@ extern long nr_blockdev_pages(void);
 
 bool __must_check blk_get_queue(struct request_queue *);
 struct request_queue *blk_alloc_queue(gfp_t);
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
-					   spinlock_t *lock);
+struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id);
 extern void blk_put_queue(struct request_queue *);
 extern void blk_set_queue_dying(struct request_queue *);
 
-- 
cgit v1.2.3


From 0d945c1f966b2bcb67bb12be749da0a7fb00201b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Nov 2018 12:17:28 -0700
Subject: block: remove the queue_lock indirection

With the legacy request path gone there is no good reason to keep
queue_lock as a pointer, we can always use the embedded lock now.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Fixed floppy and blk-cgroup missing conversions and half done edits.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 4 ++--
 include/linux/blkdev.h     | 8 +-------
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 2c68efc603bd..a9e2e2037129 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -717,11 +717,11 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
-		spin_lock_irq(q->queue_lock);
+		spin_lock_irq(&q->queue_lock);
 		blkg = blkg_lookup_create(blkcg, q);
 		if (IS_ERR(blkg))
 			blkg = NULL;
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(&q->queue_lock);
 	}
 
 	throtl = blk_throtl_bio(q, blkg, bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c4a3a660e3f0..1d185f1fc333 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -446,13 +446,7 @@ struct request_queue {
 	 */
 	gfp_t			bounce_gfp;
 
-	/*
-	 * protects queue structures from reentrancy. ->__queue_lock should
-	 * _never_ be used directly, it is queue private. always use
-	 * ->queue_lock.
-	 */
-	spinlock_t		__queue_lock;
-	spinlock_t		*queue_lock;
+	spinlock_t		queue_lock;
 
 	/*
 	 * queue kobject
-- 
cgit v1.2.3


From 344e9ffcbd1898e1dc04085564a6e05c30ea8199 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 15 Nov 2018 12:22:51 -0700
Subject: block: add queue_is_mq() helper

Various spots check for q->mq_ops being non-NULL, but provide
a helper to do this instead.

Where the ->mq_ops != NULL check is redundant, remove it.

Since mq == rq-based now that legacy is gone, get rid of the
queue_is_rq_based() and just use queue_is_mq() everywhere.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1d185f1fc333..41aaa05e42c1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -656,11 +656,7 @@ static inline bool blk_account_rq(struct request *rq)
 
 #define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)
 
-/*
- * Driver can handle struct request, if it either has an old style
- * request_fn defined, or is blk-mq based.
- */
-static inline bool queue_is_rq_based(struct request_queue *q)
+static inline bool queue_is_mq(struct request_queue *q)
 {
 	return q->mq_ops;
 }
-- 
cgit v1.2.3


From 0619317ff8baa2da9238191ad5167ed3618c16d9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 13 Nov 2018 21:16:54 -0700
Subject: block: add polled wakeup task helper

If we're polling for IO on a device that doesn't use interrupts, then
IO completion loop (and wake of task) is done by submitting task itself.
If that is the case, then we don't need to enter the wake_up_process()
function, we can simply mark ourselves as TASK_RUNNING.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 41aaa05e42c1..91c44f7a7f62 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1772,4 +1772,17 @@ static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 
 #endif /* CONFIG_BLOCK */
 
+static inline void blk_wake_io_task(struct task_struct *waiter)
+{
+	/*
+	 * If we're polling, the task itself is doing the completions. For
+	 * that case, we don't need to signal a wakeup, it's enough to just
+	 * mark us as RUNNING.
+	 */
+	if (waiter == current)
+		__set_current_state(TASK_RUNNING);
+	else
+		wake_up_process(waiter);
+}
+
 #endif
-- 
cgit v1.2.3


From 2b78eae147a13ab2ca7caa121dd3fca2eecf8613 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 16 Nov 2018 09:10:01 +0100
Subject: block: remove the rq_alloc_data request_queue field

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 91c44f7a7f62..1ad6eafc43f2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -567,7 +567,6 @@ struct request_queue {
 	bool			mq_sysfs_init_done;
 
 	size_t			cmd_size;
-	void			*rq_alloc_data;
 
 	struct work_struct	release_work;
 
-- 
cgit v1.2.3


From 85f4d4b65fdd67f1d6dc9eeb1d91923cef07eb6a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 6 Nov 2018 13:30:55 -0700
Subject: block: have ->poll_fn() return number of entries polled

We currently only really support sync poll, ie poll with 1 IO in flight.
This prepares us for supporting async poll.

Note that the returned value isn't necessarily 100% accurate. If poll
races with IRQ completion, we assume that the fact that the task is now
runnable means we found at least one entry. In reality it could be more
than 1, or not even 1. This is fine, the caller will just need to take
this into account.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1ad6eafc43f2..e97c0a3b2262 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,7 +283,7 @@ static inline unsigned short req_get_ioprio(struct request *req)
 struct blk_queue_ctx;
 
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
-typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
+typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t);
 
 struct bio_vec;
 typedef int (dma_drain_needed_fn)(struct request *);
-- 
cgit v1.2.3


From 92f806d678e5136e4777b21e5ed5368482ac9ea9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 19 Nov 2018 11:37:31 -0700
Subject: nvme-fc: remove ->poll implementation

It's specifically looking for a given request, which we will not be
supporting going forward. Also kill the qla2xxx poll implementation
as that's the only user of the nvme-fc poll, and the now unused
->poll_queue() hook.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by:  James Smart <jsmart2021@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme-fc-driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 496ff759f84c..f4ab3b1925ac 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -403,7 +403,6 @@ struct nvme_fc_port_template {
 				void **handle);
 	void	(*delete_queue)(struct nvme_fc_local_port *,
 				unsigned int qidx, void *handle);
-	void	(*poll_queue)(struct nvme_fc_local_port *, void *handle);
 	int	(*ls_req)(struct nvme_fc_local_port *,
 				struct nvme_fc_remote_port *,
 				struct nvmefc_ls_req *);
-- 
cgit v1.2.3


From e2b3fa5af70c1e646270f6c7c799414f5e904d7a Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:34 +0900
Subject: block: Remove bio->bi_ioc

bio->bi_ioc is never set so always NULL. Remove references to it in
bio_disassociate_task() and in rq_ioc() and delete this field from
struct bio. With this change, rq_ioc() always returns
current->io_context without the need for a bio argument. Further
simplify the code and make it more readable by also removing this
helper, which also allows to simplify blk_mq_sched_assign_ioc() by
removing its bio argument.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Adam Manzanares <adam.manzanares@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dbdbfbd6a987..c0ba1a038ff3 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -174,10 +174,9 @@ struct bio {
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
 	/*
-	 * Optional ioc and css associated with this bio.  Put on bio
+	 * Optional css associated with this bio.  Put on bio
 	 * release.  Read comment on top of bio_associate_current().
 	 */
-	struct io_context	*bi_ioc;
 	struct cgroup_subsys_state *bi_css;
 	struct blkcg_gq		*bi_blkg;
 	struct bio_issue	bi_issue;
-- 
cgit v1.2.3


From 64845a1ddd655574886eb48e9a5eaeeb9b05bf0d Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:35 +0900
Subject: block: Introduce get_current_ioprio()

Define get_current_ioprio() as an inline helper to obtain the caller
I/O priority from its task I/O context. Use this helper in
blk_init_request_from_bio() to set a request ioprio.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ioprio.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 9e30ed6443db..e9bfe6972aed 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -70,6 +70,19 @@ static inline int task_nice_ioclass(struct task_struct *task)
 		return IOPRIO_CLASS_BE;
 }
 
+/*
+ * If the calling process has set an I/O priority, use that. Otherwise, return
+ * the default I/O priority.
+ */
+static inline int get_current_ioprio(void)
+{
+	struct io_context *ioc = current->io_context;
+
+	if (ioc)
+		return ioc->ioprio;
+	return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+}
+
 /*
  * For inheritance, return the highest of the two given priorities
  */
-- 
cgit v1.2.3


From 20578bdfd0418efb11ec316229e670d085cd574a Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:38 +0900
Subject: block: Initialize BIO I/O priority early

For the synchronous I/O path case (read(), write() etc system calls), a
BIO I/O priority is not initialized until the execution of
blk_init_request_from_bio() when the BIO is submitted and a request
initialized for the BIO execution. This is due to the ki_ioprio field of
the struct kiocb defined on stack being always initialized to
IOPRIO_CLASS_NONE, regardless of the calling process I/O context ioprio
value set with ioprio_set(). This late initialization can result in the
BIO being merged to pending requests even when the I/O priorities
differ.

Fix this by initializing the ki_iopriority field of on stack struct
kiocb using the get_current_ioprio() helper, ensuring that all BIOs
allocated and submitted for the system call execution see the correct
intended I/O priority early. With this, since a BIO I/O priority is
always set to the intended effective value for both the sync and async
path, blk_init_request_from_bio() can be simplified.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Adam Manzanares <adam.manzanares@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c95c0807471f..a1ab233e6469 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2021,7 +2021,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 		.ki_filp = filp,
 		.ki_flags = iocb_flags(filp),
 		.ki_hint = ki_hint_validate(file_write_hint(filp)),
-		.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0),
+		.ki_ioprio = get_current_ioprio(),
 	};
 }
 
-- 
cgit v1.2.3


From 1db4909e76f64a85f4aaa187f0f683f5c85a471d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 20 Nov 2018 09:44:35 +0800
Subject: blk-mq: not embed .mq_kobj and ctx->kobj into queue instance

Even though .mq_kobj, ctx->kobj and q->kobj share same lifetime
from block layer's view, actually they don't because userspace may
grab one kobject anytime via sysfs.

This patch fixes the issue by the following approach:

1) introduce 'struct blk_mq_ctxs' for holding .mq_kobj and managing
all ctxs

2) free all allocated ctxs and the 'blk_mq_ctxs' instance in release
handler of .mq_kobj

3) grab one ref of .mq_kobj before initializing each ctx->kobj, so that
.mq_kobj is always released after all ctxs are freed.

This patch fixes kernel panic issue during booting when DEBUG_KOBJECT_RELEASE
is enabled.

Reported-by: Guenter Roeck <linux@roeck-us.net>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e97c0a3b2262..9b53db06ad08 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -456,7 +456,7 @@ struct request_queue {
 	/*
 	 * mq queue kobject
 	 */
-	struct kobject mq_kobj;
+	struct kobject *mq_kobj;
 
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity integrity;
-- 
cgit v1.2.3


From 1052b8ac5282daf35df331edcbdb645839d17e6a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 26 Nov 2018 08:21:49 -0700
Subject: blk-mq: when polling for IO, look for any completion

If we want to support async IO polling, then we have to allow finding
completions that aren't just for the one we are looking for. Always pass
in -1 to the mq_ops->poll() helper, and have that return how many events
were found in this poll loop.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9b53db06ad08..f3015e9b5ae3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -867,7 +867,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
 
-bool blk_poll(struct request_queue *q, blk_qc_t cookie);
+int blk_poll(struct request_queue *q, blk_qc_t cookie);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
-- 
cgit v1.2.3


From 9743139c5d11ab170f70a308dcb88c342390adfb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 16 Nov 2018 09:48:21 -0700
Subject: blk-mq: remove 'tag' parameter from mq_ops->poll()

We always pass in -1 now and none of the callers use the tag value,
remove the parameter.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 929e8abc5535..ca0520ca6437 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -132,7 +132,7 @@ typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *,
 typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
 typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
-typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (poll_fn)(struct blk_mq_hw_ctx *);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
 typedef bool (busy_fn)(struct request_queue *);
 typedef void (complete_fn)(struct request *);
-- 
cgit v1.2.3


From 0a1b8b87d064a47fad9ec475316002da28559207 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 26 Nov 2018 08:24:43 -0700
Subject: block: make blk_poll() take a parameter on whether to spin or not

blk_poll() has always kept spinning until it found an IO. This is
fine for SYNC polling, since we need to find one request we have
pending, but in preparation for ASYNC polling it can be beneficial
to just check if we have any entries available or not.

Existing callers are converted to pass in 'spin == true', to retain
the old behavior.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3015e9b5ae3..e3c0a8ec16a7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,7 +283,7 @@ static inline unsigned short req_get_ioprio(struct request *req)
 struct blk_queue_ctx;
 
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
-typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t);
+typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t, bool spin);
 
 struct bio_vec;
 typedef int (dma_drain_needed_fn)(struct request *);
@@ -867,7 +867,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
 
-int blk_poll(struct request_queue *q, blk_qc_t cookie);
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
-- 
cgit v1.2.3


From 16c15eb16a793f2d81ae52f41f43fb6831b34212 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 26 Nov 2018 09:54:28 -0700
Subject: blk-mq: Return true if request was completed

A driver may have internal state to cleanup if we're pretending a request
didn't complete. Return 'false' if the command wasn't actually completed
due to the timeout error injection, and true otherwise.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ca0520ca6437..6e3da356a8eb 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -298,7 +298,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 				bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
-void blk_mq_complete_request(struct request *rq);
+bool blk_mq_complete_request(struct request *rq);
 bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
 			   struct bio *bio);
 bool blk_mq_queue_stopped(struct request_queue *q);
-- 
cgit v1.2.3


From af78ff7c6e66832afcdf5418f67b11c409f9e7a1 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 26 Nov 2018 09:54:30 -0700
Subject: blk-mq: Simplify request completion state

There are no more users relying on blk-mq request states to prevent
double completions, so replace the relatively expensive cmpxchg operation
with WRITE_ONCE.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 6e3da356a8eb..b8de11e0603b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -329,20 +329,6 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q);
 
 unsigned int blk_mq_rq_cpu(struct request *rq);
 
-/**
- * blk_mq_mark_complete() - Set request state to complete
- * @rq: request to set to complete state
- *
- * Returns true if request state was successfully set to complete. If
- * successful, the caller is responsibile for seeing this request is ended, as
- * blk_mq_complete_request will not work again.
- */
-static inline bool blk_mq_mark_complete(struct request *rq)
-{
-	return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) ==
-			MQ_RQ_IN_FLIGHT;
-}
-
 /*
  * Driver command data is immediately after the request. So subtract request
  * size to get back to the original request, add request size to get the PDU.
-- 
cgit v1.2.3


From 5f0ed774ed2914decfd397569fface997532e94d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 23 Nov 2018 22:04:33 -0700
Subject: block: sum requests in the plug structure

This isn't exactly the same as the previous count, as it includes
requests for all devices. But that really doesn't matter, if we have
more than the threshold (16) queued up, flush it. It's not worth it
to have an expensive list loop for this.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e3c0a8ec16a7..02732cae6080 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1130,6 +1130,7 @@ extern void blk_set_queue_dying(struct request_queue *);
 struct blk_plug {
 	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
+	unsigned short rq_count;
 };
 #define BLK_MAX_REQUEST_COUNT 16
 #define BLK_PLUG_FLUSH_SIZE (128 * 1024)
-- 
cgit v1.2.3


From 94a2c3a32b62e868dc1e3d854326745a7f1b8c7a Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Wed, 28 Nov 2018 16:42:01 +0800
Subject: block: use rcu_work instead of call_rcu to avoid sleep in softirq

We recently got a stack by syzkaller like this:

BUG: sleeping function called from invalid context at mm/slab.h:361
in_atomic(): 1, irqs_disabled(): 0, pid: 6644, name: blkid
INFO: lockdep is turned off.
CPU: 1 PID: 6644 Comm: blkid Not tainted 4.4.163-514.55.6.9.x86_64+ #76
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
 0000000000000000 5ba6a6b879e50c00 ffff8801f6b07b10 ffffffff81cb2194
 0000000041b58ab3 ffffffff833c7745 ffffffff81cb2080 5ba6a6b879e50c00
 0000000000000000 0000000000000001 0000000000000004 0000000000000000
Call Trace:
 <IRQ>  [<ffffffff81cb2194>] __dump_stack lib/dump_stack.c:15 [inline]
 <IRQ>  [<ffffffff81cb2194>] dump_stack+0x114/0x1a0 lib/dump_stack.c:51
 [<ffffffff8129a981>] ___might_sleep+0x291/0x490 kernel/sched/core.c:7675
 [<ffffffff8129ac33>] __might_sleep+0xb3/0x270 kernel/sched/core.c:7637
 [<ffffffff81794c13>] slab_pre_alloc_hook mm/slab.h:361 [inline]
 [<ffffffff81794c13>] slab_alloc_node mm/slub.c:2610 [inline]
 [<ffffffff81794c13>] slab_alloc mm/slub.c:2692 [inline]
 [<ffffffff81794c13>] kmem_cache_alloc_trace+0x2c3/0x5c0 mm/slub.c:2709
 [<ffffffff81cbe9a7>] kmalloc include/linux/slab.h:479 [inline]
 [<ffffffff81cbe9a7>] kzalloc include/linux/slab.h:623 [inline]
 [<ffffffff81cbe9a7>] kobject_uevent_env+0x2c7/0x1150 lib/kobject_uevent.c:227
 [<ffffffff81cbf84f>] kobject_uevent+0x1f/0x30 lib/kobject_uevent.c:374
 [<ffffffff81cbb5b9>] kobject_cleanup lib/kobject.c:633 [inline]
 [<ffffffff81cbb5b9>] kobject_release+0x229/0x440 lib/kobject.c:675
 [<ffffffff81cbb0a2>] kref_sub include/linux/kref.h:73 [inline]
 [<ffffffff81cbb0a2>] kref_put include/linux/kref.h:98 [inline]
 [<ffffffff81cbb0a2>] kobject_put+0x72/0xd0 lib/kobject.c:692
 [<ffffffff8216f095>] put_device+0x25/0x30 drivers/base/core.c:1237
 [<ffffffff81c4cc34>] delete_partition_rcu_cb+0x1d4/0x2f0 block/partition-generic.c:232
 [<ffffffff813c08bc>] __rcu_reclaim kernel/rcu/rcu.h:118 [inline]
 [<ffffffff813c08bc>] rcu_do_batch kernel/rcu/tree.c:2705 [inline]
 [<ffffffff813c08bc>] invoke_rcu_callbacks kernel/rcu/tree.c:2973 [inline]
 [<ffffffff813c08bc>] __rcu_process_callbacks kernel/rcu/tree.c:2940 [inline]
 [<ffffffff813c08bc>] rcu_process_callbacks+0x59c/0x1c70 kernel/rcu/tree.c:2957
 [<ffffffff8120f509>] __do_softirq+0x299/0xe20 kernel/softirq.c:273
 [<ffffffff81210496>] invoke_softirq kernel/softirq.c:350 [inline]
 [<ffffffff81210496>] irq_exit+0x216/0x2c0 kernel/softirq.c:391
 [<ffffffff82c2cd7b>] exiting_irq arch/x86/include/asm/apic.h:652 [inline]
 [<ffffffff82c2cd7b>] smp_apic_timer_interrupt+0x8b/0xc0 arch/x86/kernel/apic/apic.c:926
 [<ffffffff82c2bc25>] apic_timer_interrupt+0xa5/0xb0 arch/x86/entry/entry_64.S:746
 <EOI>  [<ffffffff814cbf40>] ? audit_kill_trees+0x180/0x180
 [<ffffffff8187d2f7>] fd_install+0x57/0x80 fs/file.c:626
 [<ffffffff8180989e>] do_sys_open+0x45e/0x550 fs/open.c:1043
 [<ffffffff818099c2>] SYSC_open fs/open.c:1055 [inline]
 [<ffffffff818099c2>] SyS_open+0x32/0x40 fs/open.c:1050
 [<ffffffff82c299e1>] entry_SYSCALL_64_fastpath+0x1e/0x9a

In softirq context, we call rcu callback function delete_partition_rcu_cb(),
which may allocate memory by kzalloc with GFP_KERNEL flag. If the
allocation cannot be satisfied, it may sleep. However, That is not allowed
in softirq contex.

Although we found this problem on linux 4.4, the latest kernel version
seems to have this problem as well. And it is very similar to the
previous one:
	https://lkml.org/lkml/2018/7/9/391

Fix it by using RCU workqueue, which allows sleep.

Reviewed-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 70fc838e6773..0c5ee17b4d88 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -129,7 +129,7 @@ struct hd_struct {
 	struct disk_stats dkstats;
 #endif
 	struct percpu_ref ref;
-	struct rcu_head rcu_head;
+	struct rcu_work rcu_work;
 };
 
 #define GENHD_FL_REMOVABLE			1
-- 
cgit v1.2.3


From ce5b009cff1961137127edf91f44effd0eec8ffd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 Nov 2018 17:13:56 -0700
Subject: block: improve logic around when to sort a plug list

Only do it if we have requests for multiple queues in the same
plug.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 02732cae6080..08d940f85fa0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1131,6 +1131,7 @@ struct blk_plug {
 	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
 	unsigned short rq_count;
+	bool multiple_queues;
 };
 #define BLK_MAX_REQUEST_COUNT 16
 #define BLK_PLUG_FLUSH_SIZE (128 * 1024)
-- 
cgit v1.2.3


From d666ba98f849ad44c4405ecc2180390ebe80f4f9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 Nov 2018 17:02:25 -0700
Subject: blk-mq: add mq_ops->commit_rqs()

blk-mq passes information to the hardware about any given request being
the last that we will issue in this sequence. The point is that hardware
can defer costly doorbell type writes to the last request. But if we run
into errors issuing a sequence of requests, we may never send the request
with bd->last == true set. For that case, we need a hook that tells the
hardware that nothing else is coming right now.

For failures returned by the drivers ->queue_rq() hook, the driver is
responsible for flushing pending requests, if it uses bd->last to
optimize that part. This works like before, no changes there.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b8de11e0603b..467f1dd21ccf 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -117,6 +117,7 @@ struct blk_mq_queue_data {
 
 typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
 		const struct blk_mq_queue_data *);
+typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *);
 /* takes rq->cmd_flags as input, returns a hardware type index */
 typedef int (rq_flags_to_type_fn)(struct request_queue *, unsigned int);
 typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
@@ -144,6 +145,15 @@ struct blk_mq_ops {
 	 */
 	queue_rq_fn		*queue_rq;
 
+	/*
+	 * If a driver uses bd->last to judge when to submit requests to
+	 * hardware, it must define this function. In case of errors that
+	 * make us stop issuing further requests, this hook serves the
+	 * purpose of kicking the hardware (which the last request otherwise
+	 * would have done).
+	 */
+	commit_rqs_fn		*commit_rqs;
+
 	/*
 	 * Return a queue map type for the given request/bio flags
 	 */
-- 
cgit v1.2.3


From ea86ea2cdced20057da4d2c32965c1219c238197 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 30 Nov 2018 13:18:06 -0700
Subject: sbitmap: ammortize cost of clearing bits

sbitmap maintains a set of words that we use to set and clear bits, with
each bit representing a tag for blk-mq. Even though we spread the bits
out and maintain a hint cache, one particular bit allocated will end up
being cleared in the exact same spot.

This introduces batched clearing of bits. Instead of clearing a given
bit, the same bit is set in a cleared/free mask instead. If we fail
allocating a bit from a given word, then we check the free mask, and
batch move those cleared bits at that time. This trades 64 atomic bitops
for 2 cmpxchg().

In a threaded poll test case, half the overhead of getting and clearing
tags is removed with this change. On another poll test case with a
single thread, performance is unchanged.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 804a50983ec5..81359d45751e 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -30,14 +30,24 @@ struct seq_file;
  */
 struct sbitmap_word {
 	/**
-	 * @word: The bitmap word itself.
+	 * @depth: Number of bits being used in @word/@cleared
 	 */
-	unsigned long word;
+	unsigned long depth;
 
 	/**
-	 * @depth: Number of bits being used in @word.
+	 * @word: word holding free bits
 	 */
-	unsigned long depth;
+	unsigned long word ____cacheline_aligned_in_smp;
+
+	/**
+	 * @cleared: word holding cleared bits
+	 */
+	unsigned long cleared ____cacheline_aligned_in_smp;
+
+	/**
+	 * @swap_lock: Held while swapping word <-> cleared
+	 */
+	spinlock_t swap_lock;
 } ____cacheline_aligned_in_smp;
 
 /**
@@ -310,6 +320,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
+/*
+ * This one is special, since it doesn't actually clear the bit, rather it
+ * sets the corresponding bit in the ->cleared mask instead. Paired with
+ * the caller doing sbitmap_batch_clear() if a given index is full, which
+ * will clear the previously freed entries in the corresponding ->word.
+ */
+static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
+
+	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
+}
+
 static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
 					    unsigned int bitnr)
 {
@@ -321,8 +344,6 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
 	return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
-unsigned int sbitmap_weight(const struct sbitmap *sb);
-
 /**
  * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
  * @sb: Bitmap to show.
-- 
cgit v1.2.3


From 5d2ee7122c73be6a3b6bfe90d237e8aed737cfaa Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 29 Nov 2018 17:36:41 -0700
Subject: sbitmap: optimize wakeup check

Even if we have no waiters on any of the sbitmap_queue wait states, we
still have to loop every entry to check. We do this for every IO, so
the cost adds up.

Shift a bit of the cost to the slow path, when we actually have waiters.
Wrap prepare_to_wait_exclusive() and finish_wait(), so we can maintain
an internal count of how many are currently active. Then we can simply
check this count in sbq_wake_ptr() and not have to loop if we don't
have any sleepers.

Convert the two users of sbitmap with waiting, blk-mq-tag and iSCSI.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 81359d45751e..92806a2dbab7 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -135,6 +135,11 @@ struct sbitmap_queue {
 	 */
 	struct sbq_wait_state *ws;
 
+	/*
+	 * @ws_active: count of currently active ws waitqueues
+	 */
+	atomic_t ws_active;
+
 	/**
 	 * @round_robin: Allocate bits in strict round-robin order.
 	 */
@@ -552,4 +557,33 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
  */
 void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
 
+struct sbq_wait {
+	int accounted;
+	struct wait_queue_entry wait;
+};
+
+#define DEFINE_SBQ_WAIT(name)							\
+	struct sbq_wait name = {						\
+		.accounted = 0,							\
+		.wait = {							\
+			.private	= current,				\
+			.func		= autoremove_wake_function,		\
+			.entry		= LIST_HEAD_INIT((name).wait.entry),	\
+		}								\
+	}
+
+/*
+ * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
+ * internal state.
+ */
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+				struct sbq_wait_state *ws,
+				struct sbq_wait *sbq_wait, int state);
+
+/*
+ * Must be paired with sbitmap_prepare_to_wait().
+ */
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+				struct sbq_wait *sbq_wait);
+
 #endif /* __LINUX_SCALE_BITMAP_H */
-- 
cgit v1.2.3


From 8c2def893afc60d88160d524acf345765cf0c447 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Mon, 3 Dec 2018 14:45:43 -0800
Subject: sbitmap: fix sbitmap_for_each_set()

We need to ignore bits in the cleared mask when iterating over all set
bits.

Fixes: ea86ea2cdced ("sbitmap: ammortize cost of clearing bits")
Reported-by: Jens Axboe@kernel.dk>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 92806a2dbab7..03f50fcedc79 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -265,12 +265,14 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb,
 	nr = SB_NR_TO_BIT(sb, start);
 
 	while (scanned < sb->depth) {
-		struct sbitmap_word *word = &sb->map[index];
-		unsigned int depth = min_t(unsigned int, word->depth - nr,
+		unsigned long word;
+		unsigned int depth = min_t(unsigned int,
+					   sb->map[index].depth - nr,
 					   sb->depth - scanned);
 
 		scanned += depth;
-		if (!word->word)
+		word = sb->map[index].word & ~sb->map[index].cleared;
+		if (!word)
 			goto next;
 
 		/*
@@ -280,7 +282,7 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb,
 		 */
 		depth += nr;
 		while (1) {
-			nr = find_next_bit(&word->word, depth, nr);
+			nr = find_next_bit(&word, depth, nr);
 			if (nr >= depth)
 				break;
 			if (!fn(sb, (index << sb->shift) + nr, data))
-- 
cgit v1.2.3


From e20ba6e1da029136ded295f33076483d65ddf50a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:16 +0100
Subject: block: move queues types to the block layer

Having another indirect all in the fast path doesn't really help
in our post-spectre world.  Also having too many queue type is just
going to create confusion, so I'd rather manage them centrally.

Note that the queue type naming and ordering changes a bit - the
first index now is the default queue for everything not explicitly
marked, the optional ones are read and poll queues.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 467f1dd21ccf..57eda7b20243 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -81,8 +81,12 @@ struct blk_mq_queue_map {
 	unsigned int queue_offset;
 };
 
-enum {
-	HCTX_MAX_TYPES = 3,
+enum hctx_type {
+	HCTX_TYPE_DEFAULT,	/* all I/O not otherwise accounted for */
+	HCTX_TYPE_READ,		/* just for READ I/O */
+	HCTX_TYPE_POLL,		/* polled I/O of any kind */
+
+	HCTX_MAX_TYPES,
 };
 
 struct blk_mq_tag_set {
@@ -118,8 +122,6 @@ struct blk_mq_queue_data {
 typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
 		const struct blk_mq_queue_data *);
 typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *);
-/* takes rq->cmd_flags as input, returns a hardware type index */
-typedef int (rq_flags_to_type_fn)(struct request_queue *, unsigned int);
 typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
 typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
@@ -154,11 +156,6 @@ struct blk_mq_ops {
 	 */
 	commit_rqs_fn		*commit_rqs;
 
-	/*
-	 * Return a queue map type for the given request/bio flags
-	 */
-	rq_flags_to_type_fn	*rq_flags_to_type;
-
 	/*
 	 * Reserve budget before queue request, once .queue_rq is
 	 * run, it is driver's responsibility to release the
-- 
cgit v1.2.3


From 529262d56dbebe6a26df5d2fd24cc0e1bc8579e5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:26 +0100
Subject: block: remove ->poll_fn

This was intended to support users like nvme multipath, but is just
getting in the way and adding another indirect call.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 08d940f85fa0..0b3874bdbc6a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,7 +283,6 @@ static inline unsigned short req_get_ioprio(struct request *req)
 struct blk_queue_ctx;
 
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
-typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t, bool spin);
 
 struct bio_vec;
 typedef int (dma_drain_needed_fn)(struct request *);
@@ -401,7 +400,6 @@ struct request_queue {
 	struct rq_qos		*rq_qos;
 
 	make_request_fn		*make_request_fn;
-	poll_q_fn		*poll_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 
 	const struct blk_mq_ops	*mq_ops;
-- 
cgit v1.2.3


From 6e0de61107f03c3222550d9b548cd331d31d82d1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 5 Dec 2018 06:50:40 -0700
Subject: blk-mq: remove QUEUE_FLAG_POLL from default MQ flags

We only support polling if we have poll queues now, but the flag is
being set by default. Remove the default QUEUE_FLAG_POLL setting, we'll
set it in blk_mq_init_allocated_queue() if we have poll queues available
for this device.

Fixes: 6544d229bf43 ("block: enable polling by default if a poll map is initalized")
Reported-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0b3874bdbc6a..81f1b105946b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -606,8 +606,7 @@ struct request_queue {
 				 (1 << QUEUE_FLAG_ADD_RANDOM))
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
-				 (1 << QUEUE_FLAG_SAME_COMP)	|	\
-				 (1 << QUEUE_FLAG_POLL))
+				 (1 << QUEUE_FLAG_SAME_COMP))
 
 void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
-- 
cgit v1.2.3


From 0fe061b9f03c27d0370888efc22d4b3ac7af90cf Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:26 -0500
Subject: blkcg: fix ref count issue with bio_blkcg() using task_css

The bio_blkcg() function turns out to be inconsistent and consequently
dangerous to use. The first part returns a blkcg where a reference is
owned by the bio meaning it does not need to be rcu protected. However,
the third case, the last line, is problematic:

	return css_to_blkcg(task_css(current, io_cgrp_id));

This can race against task migration and the cgroup dying. It is also
semantically different as it must be called rcu protected and is
susceptible to failure when trying to get a reference to it.

This patch adds association ahead of calling bio_blkcg() rather than
after. This makes association a required and explicit step along the
code paths for calling bio_blkcg(). In blk-iolatency, association is
moved above the bio_blkcg() call to ensure it will not return %NULL.

BFQ uses the old bio_blkcg() function, but I do not want to address it
in this series due to the complexity. I have created a private version
documenting the inconsistency and noting not to use it.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 98 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 90 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a9e2e2037129..f619307171a6 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -227,22 +227,103 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		   char *input, struct blkg_conf_ctx *ctx);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
+/**
+ * blkcg_css - find the current css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This may return a dying css, so it is up to the caller to use tryget logic
+ * to confirm it is alive and well.
+ */
+static inline struct cgroup_subsys_state *blkcg_css(void)
+{
+	struct cgroup_subsys_state *css;
+
+	css = kthread_blkcg();
+	if (css)
+		return css;
+	return task_css(current, io_cgrp_id);
+}
+
+/**
+ * blkcg_get_css - find and get a reference to the css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This takes a reference on the blkcg which will need to be managed by the
+ * caller.
+ */
+static inline struct cgroup_subsys_state *blkcg_get_css(void)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+
+	css = kthread_blkcg();
+	if (css) {
+		css_get(css);
+	} else {
+		/*
+		 * This is a bit complicated.  It is possible task_css() is
+		 * seeing an old css pointer here.  This is caused by the
+		 * current thread migrating away from this cgroup and this
+		 * cgroup dying.  css_tryget() will fail when trying to take a
+		 * ref on a cgroup that's ref count has hit 0.
+		 *
+		 * Therefore, if it does fail, this means current must have
+		 * been swapped away already and this is waiting for it to
+		 * propagate on the polling cpu.  Hence the use of cpu_relax().
+		 */
+		while (true) {
+			css = task_css(current, io_cgrp_id);
+			if (likely(css_tryget(css)))
+				break;
+			cpu_relax();
+		}
+	}
+
+	rcu_read_unlock();
+
+	return css;
+}
 
 static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct blkcg, css) : NULL;
 }
 
-static inline struct blkcg *bio_blkcg(struct bio *bio)
+/**
+ * __bio_blkcg - internal, inconsistent version to get blkcg
+ *
+ * DO NOT USE.
+ * This function is inconsistent and consequently is dangerous to use.  The
+ * first part of the function returns a blkcg where a reference is owned by the
+ * bio.  This means it does not need to be rcu protected as it cannot go away
+ * with the bio owning a reference to it.  However, the latter potentially gets
+ * it from task_css().  This can race against task migration and the cgroup
+ * dying.  It is also semantically different as it must be called rcu protected
+ * and is susceptible to failure when trying to get a reference to it.
+ * Therefore, it is not ok to assume that *_get() will always succeed on the
+ * blkcg returned here.
+ */
+static inline struct blkcg *__bio_blkcg(struct bio *bio)
 {
-	struct cgroup_subsys_state *css;
+	if (bio && bio->bi_css)
+		return css_to_blkcg(bio->bi_css);
+	return css_to_blkcg(blkcg_css());
+}
 
+/**
+ * bio_blkcg - grab the blkcg associated with a bio
+ * @bio: target bio
+ *
+ * This returns the blkcg associated with a bio, %NULL if not associated.
+ * Callers are expected to either handle %NULL or know association has been
+ * done prior to calling this.
+ */
+static inline struct blkcg *bio_blkcg(struct bio *bio)
+{
 	if (bio && bio->bi_css)
 		return css_to_blkcg(bio->bi_css);
-	css = kthread_blkcg();
-	if (css)
-		return css_to_blkcg(css);
-	return css_to_blkcg(task_css(current, io_cgrp_id));
+	return NULL;
 }
 
 static inline bool blk_cgroup_congested(void)
@@ -710,10 +791,10 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	bool throtl = false;
 
 	rcu_read_lock();
-	blkcg = bio_blkcg(bio);
 
 	/* associate blkcg if bio hasn't attached one */
-	bio_associate_blkcg(bio, &blkcg->css);
+	bio_associate_blkcg(bio, NULL);
+	blkcg = bio_blkcg(bio);
 
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
@@ -835,6 +916,7 @@ static inline int blkcg_activate_policy(struct request_queue *q,
 static inline void blkcg_deactivate_policy(struct request_queue *q,
 					   const struct blkcg_policy *pol) { }
 
+static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; }
 static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
 
 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
-- 
cgit v1.2.3


From b978962ad4f7f9c06e5aa07b2a9b22f6d600456c Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:27 -0500
Subject: blkcg: update blkg_lookup_create() to do locking

To know when to create a blkg, the general pattern is to do a
blkg_lookup() and if that fails, lock and do the lookup again, and if
that fails finally create. It doesn't make much sense for everyone who
wants to do creation to write this themselves.

This changes blkg_lookup_create() to do locking and implement this
pattern. The old blkg_lookup_create() is renamed to
__blkg_lookup_create().  If a call site wants to do its own error
handling or already owns the queue lock, they can use
__blkg_lookup_create(). This will be used in upcoming patches.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f619307171a6..b3b1a8187d23 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -181,6 +181,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css;
 
 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 				      struct request_queue *q, bool update_hint);
+struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
+				      struct request_queue *q);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 				    struct request_queue *q);
 int blkcg_init_queue(struct request_queue *q);
@@ -799,7 +801,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
 		spin_lock_irq(&q->queue_lock);
-		blkg = blkg_lookup_create(blkcg, q);
+		blkg = __blkg_lookup_create(blkcg, q);
 		if (IS_ERR(blkg))
 			blkg = NULL;
 		spin_unlock_irq(&q->queue_lock);
-- 
cgit v1.2.3


From beea9da07d8a6228a7e4a31a83f9478d513bf03f Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:28 -0500
Subject: blkcg: convert blkg_lookup_create() to find closest blkg

There are several scenarios where blkg_lookup_create() can fail such as
the blkcg dying, request_queue is dying, or simply being OOM. Most
handle this by simply falling back to the q->root_blkg and calling it a
day.

This patch implements the notion of closest blkg. During
blkg_lookup_create(), if it fails to create, return the closest blkg
found or the q->root_blkg. blkg_try_get_closest() is introduced and used
during association so a bio is always attached to a blkg.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index b3b1a8187d23..c08e96e521ed 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -545,6 +545,20 @@ static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
 	return NULL;
 }
 
+/**
+ * blkg_try_get_closest - try and get a blkg ref on the closet blkg
+ * @blkg: blkg to get
+ *
+ * This walks up the blkg tree to find the closest non-dying blkg and returns
+ * the blkg that it did association with as it may not be the passed in blkg.
+ */
+static inline struct blkcg_gq *blkg_try_get_closest(struct blkcg_gq *blkg)
+{
+	while (!atomic_inc_not_zero(&blkg->refcnt))
+		blkg = blkg->parent;
+
+	return blkg;
+}
 
 void __blkg_release_rcu(struct rcu_head *rcu);
 
@@ -797,15 +811,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	/* associate blkcg if bio hasn't attached one */
 	bio_associate_blkcg(bio, NULL);
 	blkcg = bio_blkcg(bio);
-
-	blkg = blkg_lookup(blkcg, q);
-	if (unlikely(!blkg)) {
-		spin_lock_irq(&q->queue_lock);
-		blkg = __blkg_lookup_create(blkcg, q);
-		if (IS_ERR(blkg))
-			blkg = NULL;
-		spin_unlock_irq(&q->queue_lock);
-	}
+	blkg = blkg_lookup_create(blkcg, q);
 
 	throtl = blk_throtl_bio(q, blkg, bio);
 
-- 
cgit v1.2.3


From 2268c0feb0ffb1c1bb6e1d4d5505d30f485aa77b Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:29 -0500
Subject: blkcg: introduce common blkg association logic

There are 3 ways blkg association can happen: association with the
current css, with the page css (swap), or from the wbc css (writeback).

This patch handles how association is done for the first case where we
are associating bsaed on the current css. If there is already a blkg
associated, the css will be reused and association will be redone as the
request_queue may have changed.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 056fb627edb3..62715a5a4f32 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -511,12 +511,15 @@ static inline int bio_associate_blkcg_from_page(struct bio *bio,
 
 #ifdef CONFIG_BLK_CGROUP
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
-int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg);
+void bio_disassociate_blkg(struct bio *bio);
+void bio_associate_blkg(struct bio *bio);
 void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
 static inline int bio_associate_blkcg(struct bio *bio,
 			struct cgroup_subsys_state *blkcg_css) { return 0; }
+static inline void bio_disassociate_blkg(struct bio *bio) { }
+static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_disassociate_task(struct bio *bio) { }
 static inline void bio_clone_blkcg_association(struct bio *dst,
 			struct bio *src) { }
-- 
cgit v1.2.3


From 5cdf2e3fea5ee37b66842d76a9b06e6dac0b933d Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:31 -0500
Subject: blkcg: associate blkg when associating a device

Previously, blkg association was handled by controller specific code in
blk-throttle and blk-iolatency. However, because a blkg represents a
relationship between a blkcg and a request_queue, it makes sense to keep
the blkg->q and bio->bi_disk->queue consistent.

This patch moves association into the bio_set_dev macro(). This should
cover the majority of cases where the device is set/changed keeping the
two pointers consistent. Fallback code is added to
blkcg_bio_issue_check() to catch any missing paths.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h        |  2 ++
 include/linux/blk-cgroup.h | 18 ++++++++++--------
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 62715a5a4f32..6ee2ea8b378a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -491,12 +491,14 @@ do {						\
 		bio_clear_flag(bio, BIO_THROTTLED);\
 	(bio)->bi_disk = (bdev)->bd_disk;	\
 	(bio)->bi_partno = (bdev)->bd_partno;	\
+	bio_associate_blkg(bio);		\
 } while (0)
 
 #define bio_copy_dev(dst, src)			\
 do {						\
 	(dst)->bi_disk = (src)->bi_disk;	\
 	(dst)->bi_partno = (src)->bi_partno;	\
+	bio_clone_blkcg_association(dst, src);	\
 } while (0)
 
 #define bio_dev(bio) \
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index c08e96e521ed..f09752968c2a 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -21,6 +21,7 @@
 #include <linux/blkdev.h>
 #include <linux/atomic.h>
 #include <linux/kthread.h>
+#include <linux/fs.h>
 
 /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
 #define BLKG_STAT_CPU_BATCH	(INT_MAX / 2)
@@ -802,21 +803,23 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio)
 {
-	struct blkcg *blkcg;
 	struct blkcg_gq *blkg;
 	bool throtl = false;
 
-	rcu_read_lock();
+	if (!bio->bi_blkg) {
+		char b[BDEVNAME_SIZE];
+
+		WARN_ONCE(1,
+			  "no blkg associated for bio on block-device: %s\n",
+			  bio_devname(bio, b));
+		bio_associate_blkg(bio);
+	}
 
-	/* associate blkcg if bio hasn't attached one */
-	bio_associate_blkcg(bio, NULL);
-	blkcg = bio_blkcg(bio);
-	blkg = blkg_lookup_create(blkcg, q);
+	blkg = bio->bi_blkg;
 
 	throtl = blk_throtl_bio(q, blkg, bio);
 
 	if (!throtl) {
-		blkg = blkg ?: q->root_blkg;
 		/*
 		 * If the bio is flagged with BIO_QUEUE_ENTERED it means this
 		 * is a split bio and we would have already accounted for the
@@ -828,7 +831,6 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
 	}
 
-	rcu_read_unlock();
 	return !throtl;
 }
 
-- 
cgit v1.2.3


From e439bedf6b24264f620cc05627e23a90054bde41 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:32 -0500
Subject: blkcg: consolidate bio_issue_init() to be a part of core

bio_issue_init among other things initializes the timestamp for an IO.
Rather than have this logic handled by policies, this consolidates it to
be on the init paths (normal, clone, bounce clone).

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Liu Bo <bo.liu@linux.alibaba.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f09752968c2a..8b069c3775ee 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -800,6 +800,12 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
 				  struct bio *bio) { return false; }
 #endif
 
+
+static inline void blkcg_bio_issue_init(struct bio *bio)
+{
+	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+}
+
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio)
 {
@@ -831,6 +837,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
 	}
 
+	blkcg_bio_issue_init(bio);
+
 	return !throtl;
 }
 
@@ -936,6 +944,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 
+static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio) { return true; }
 
-- 
cgit v1.2.3


From 6a7f6d86a561473032287c8e4583eac5853c6efa Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:33 -0500
Subject: blkcg: associate a blkg for pages being evicted by swap

A prior patch in this series added blkg association to bios issued by
cgroups. There are two other paths that we want to attribute work back
to the appropriate cgroup: swap and writeback. Here we modify the way
swap tags bios to include the blkg. Writeback will be tackle in the next
patch.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6ee2ea8b378a..f13572c254a7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -505,10 +505,10 @@ do {						\
 	disk_devt((bio)->bi_disk)
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-int bio_associate_blkcg_from_page(struct bio *bio, struct page *page);
+void bio_associate_blkg_from_page(struct bio *bio, struct page *page);
 #else
-static inline int bio_associate_blkcg_from_page(struct bio *bio,
-						struct page *page) {  return 0; }
+static inline void bio_associate_blkg_from_page(struct bio *bio,
+						struct page *page) { }
 #endif
 
 #ifdef CONFIG_BLK_CGROUP
-- 
cgit v1.2.3


From fd42df305f804ddc0d5ac028e944784283b2f92d Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:34 -0500
Subject: blkcg: associate writeback bios with a blkg

One of the goals of this series is to remove a separate reference to
the css of the bio. This can and should be accessed via bio_blkcg(). In
this patch, wbc_init_bio() now requires a bio to have a device
associated with it.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h       | 5 +++++
 include/linux/writeback.h | 5 +++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index f13572c254a7..f0438061a5a3 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -515,6 +515,8 @@ static inline void bio_associate_blkg_from_page(struct bio *bio,
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 void bio_disassociate_blkg(struct bio *bio);
 void bio_associate_blkg(struct bio *bio);
+void bio_associate_blkg_from_css(struct bio *bio,
+				 struct cgroup_subsys_state *css);
 void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
@@ -522,6 +524,9 @@ static inline int bio_associate_blkcg(struct bio *bio,
 			struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline void bio_disassociate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg(struct bio *bio) { }
+static inline void bio_associate_blkg_from_css(struct bio *bio,
+					       struct cgroup_subsys_state *css)
+{ }
 static inline void bio_disassociate_task(struct bio *bio) { }
 static inline void bio_clone_blkcg_association(struct bio *dst,
 			struct bio *src) { }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index fdfd04e348f6..738a0c24874f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -246,7 +246,8 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
  *
  * @bio is a part of the writeback in progress controlled by @wbc.  Perform
  * writeback specific initialization.  This is used to apply the cgroup
- * writeback context.
+ * writeback context.  Must be called after the bio has been associated with
+ * a device.
  */
 static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 {
@@ -257,7 +258,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 	 * regular writeback instead of writing things out itself.
 	 */
 	if (wbc->wb)
-		bio_associate_blkcg(bio, wbc->wb->blkcg_css);
+		bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
 }
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
-- 
cgit v1.2.3


From db6638d7d177a8bc74c9e539e2e0d7d061c767b1 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:35 -0500
Subject: blkcg: remove bio->bi_css and instead use bio->bi_blkg

Prior patches ensured that any bio that interacts with a request_queue
is properly associated with a blkg. This makes bio->bi_css unnecessary
as blkg maintains a reference to blkcg already.

This removes the bio field bi_css and transfers corresponding uses to
access via bi_blkg.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h        | 11 ++++-------
 include/linux/blk-cgroup.h |  8 ++++----
 include/linux/blk_types.h  |  7 ++++---
 3 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index f0438061a5a3..84e1c4dc703a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -498,7 +498,7 @@ do {						\
 do {						\
 	(dst)->bi_disk = (src)->bi_disk;	\
 	(dst)->bi_partno = (src)->bi_partno;	\
-	bio_clone_blkcg_association(dst, src);	\
+	bio_clone_blkg_association(dst, src);	\
 } while (0)
 
 #define bio_dev(bio) \
@@ -512,24 +512,21 @@ static inline void bio_associate_blkg_from_page(struct bio *bio,
 #endif
 
 #ifdef CONFIG_BLK_CGROUP
-int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 void bio_disassociate_blkg(struct bio *bio);
 void bio_associate_blkg(struct bio *bio);
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css);
 void bio_disassociate_task(struct bio *bio);
-void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
+void bio_clone_blkg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
-static inline int bio_associate_blkcg(struct bio *bio,
-			struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline void bio_disassociate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg_from_css(struct bio *bio,
 					       struct cgroup_subsys_state *css)
 { }
 static inline void bio_disassociate_task(struct bio *bio) { }
-static inline void bio_clone_blkcg_association(struct bio *dst,
-			struct bio *src) { }
+static inline void bio_clone_blkg_association(struct bio *dst,
+					      struct bio *src) { }
 #endif	/* CONFIG_BLK_CGROUP */
 
 #ifdef CONFIG_HIGHMEM
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 8b069c3775ee..f11c37f8ce09 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -309,8 +309,8 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
  */
 static inline struct blkcg *__bio_blkcg(struct bio *bio)
 {
-	if (bio && bio->bi_css)
-		return css_to_blkcg(bio->bi_css);
+	if (bio && bio->bi_blkg)
+		return bio->bi_blkg->blkcg;
 	return css_to_blkcg(blkcg_css());
 }
 
@@ -324,8 +324,8 @@ static inline struct blkcg *__bio_blkcg(struct bio *bio)
  */
 static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
-	if (bio && bio->bi_css)
-		return css_to_blkcg(bio->bi_css);
+	if (bio && bio->bi_blkg)
+		return bio->bi_blkg->blkcg;
 	return NULL;
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c0ba1a038ff3..46c005d601ac 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -174,10 +174,11 @@ struct bio {
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
 	/*
-	 * Optional css associated with this bio.  Put on bio
-	 * release.  Read comment on top of bio_associate_current().
+	 * Represents the association of the css and request_queue for the bio.
+	 * If a bio goes direct to device, it will not have a blkg as it will
+	 * not have a request_queue associated with it.  The reference is put
+	 * on release of the bio.
 	 */
-	struct cgroup_subsys_state *bi_css;
 	struct blkcg_gq		*bi_blkg;
 	struct bio_issue	bi_issue;
 #endif
-- 
cgit v1.2.3


From fc5a828bfad628c1092194f2814604943561c52d Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:36 -0500
Subject: blkcg: remove additional reference to the css

The previous patch in this series removed carrying around a pointer to
the css in blkg. However, the blkg association logic still relied on
taking a reference on the css to ensure we wouldn't fail in getting a
reference for the blkg.

Here the implicit dependency on the css is removed. The association
continues to rely on the tryget logic walking up the blkg tree. This
streamlines the three ways that association can happen: normal, swap,
and writeback.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 41 -----------------------------------------
 include/linux/cgroup.h     |  2 ++
 2 files changed, 2 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f11c37f8ce09..284819a4d122 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -247,47 +247,6 @@ static inline struct cgroup_subsys_state *blkcg_css(void)
 	return task_css(current, io_cgrp_id);
 }
 
-/**
- * blkcg_get_css - find and get a reference to the css
- *
- * Find the css associated with either the kthread or the current task.
- * This takes a reference on the blkcg which will need to be managed by the
- * caller.
- */
-static inline struct cgroup_subsys_state *blkcg_get_css(void)
-{
-	struct cgroup_subsys_state *css;
-
-	rcu_read_lock();
-
-	css = kthread_blkcg();
-	if (css) {
-		css_get(css);
-	} else {
-		/*
-		 * This is a bit complicated.  It is possible task_css() is
-		 * seeing an old css pointer here.  This is caused by the
-		 * current thread migrating away from this cgroup and this
-		 * cgroup dying.  css_tryget() will fail when trying to take a
-		 * ref on a cgroup that's ref count has hit 0.
-		 *
-		 * Therefore, if it does fail, this means current must have
-		 * been swapped away already and this is waiting for it to
-		 * propagate on the polling cpu.  Hence the use of cpu_relax().
-		 */
-		while (true) {
-			css = task_css(current, io_cgrp_id);
-			if (likely(css_tryget(css)))
-				break;
-			cpu_relax();
-		}
-	}
-
-	rcu_read_unlock();
-
-	return css;
-}
-
 static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct blkcg, css) : NULL;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9d12757a65b0..9968332cceed 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -93,6 +93,8 @@ extern struct css_set init_css_set;
 
 bool css_has_online_children(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
+					 struct cgroup_subsys *ss);
 struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
 					     struct cgroup_subsys *ss);
 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
-- 
cgit v1.2.3


From 6f70fb66182b02e50deea65e9a3a86b7bf659a39 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:37 -0500
Subject: blkcg: remove bio_disassociate_task()

Now that a bio only holds a blkg reference, so clean up is simply
putting back that reference. Remove bio_disassociate_task() as it just
calls bio_disassociate_blkg() and call the latter directly.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 84e1c4dc703a..7380b094dcca 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -516,7 +516,6 @@ void bio_disassociate_blkg(struct bio *bio);
 void bio_associate_blkg(struct bio *bio);
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css);
-void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
 static inline void bio_disassociate_blkg(struct bio *bio) { }
@@ -524,7 +523,6 @@ static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg_from_css(struct bio *bio,
 					       struct cgroup_subsys_state *css)
 { }
-static inline void bio_disassociate_task(struct bio *bio) { }
 static inline void bio_clone_blkg_association(struct bio *dst,
 					      struct bio *src) { }
 #endif	/* CONFIG_BLK_CGROUP */
-- 
cgit v1.2.3


From 7fcf2b033b84e261dca283bc2911aaea4b07b525 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:38 -0500
Subject: blkcg: change blkg reference counting to use percpu_ref

Every bio is now associated with a blkg putting blkg_get, blkg_try_get,
and blkg_put on the hot path. Switch over the refcnt in blkg to use
percpu_ref.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 284819a4d122..d19ef15a673d 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -124,7 +124,7 @@ struct blkcg_gq {
 	struct blkcg_gq			*parent;
 
 	/* reference count */
-	atomic_t			refcnt;
+	struct percpu_ref		refcnt;
 
 	/* is this blkg online? protected by both blkcg and q locks */
 	bool				online;
@@ -487,8 +487,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
  */
 static inline void blkg_get(struct blkcg_gq *blkg)
 {
-	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-	atomic_inc(&blkg->refcnt);
+	percpu_ref_get(&blkg->refcnt);
 }
 
 /**
@@ -500,7 +499,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
  */
 static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
 {
-	if (atomic_inc_not_zero(&blkg->refcnt))
+	if (percpu_ref_tryget(&blkg->refcnt))
 		return blkg;
 	return NULL;
 }
@@ -514,23 +513,19 @@ static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
  */
 static inline struct blkcg_gq *blkg_try_get_closest(struct blkcg_gq *blkg)
 {
-	while (!atomic_inc_not_zero(&blkg->refcnt))
+	while (!percpu_ref_tryget(&blkg->refcnt))
 		blkg = blkg->parent;
 
 	return blkg;
 }
 
-void __blkg_release_rcu(struct rcu_head *rcu);
-
 /**
  * blkg_put - put a blkg reference
  * @blkg: blkg to put
  */
 static inline void blkg_put(struct blkcg_gq *blkg)
 {
-	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-	if (atomic_dec_and_test(&blkg->refcnt))
-		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
+	percpu_ref_put(&blkg->refcnt);
 }
 
 /**
-- 
cgit v1.2.3


From 7754f669ffde3919e398a9e591cd7510d6cf4e73 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:39 -0500
Subject: blkcg: rename blkg_try_get() to blkg_tryget()

blkg reference counting now uses percpu_ref rather than atomic_t. Let's
make this consistent with css_tryget. This renames blkg_try_get to
blkg_tryget and now returns a bool rather than the blkg or %NULL.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index d19ef15a673d..752de1becb5c 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -491,27 +491,25 @@ static inline void blkg_get(struct blkcg_gq *blkg)
 }
 
 /**
- * blkg_try_get - try and get a blkg reference
+ * blkg_tryget - try and get a blkg reference
  * @blkg: blkg to get
  *
  * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
  * of freeing this blkg, so we can only use it if the refcnt is not zero.
  */
-static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
+static inline bool blkg_tryget(struct blkcg_gq *blkg)
 {
-	if (percpu_ref_tryget(&blkg->refcnt))
-		return blkg;
-	return NULL;
+	return percpu_ref_tryget(&blkg->refcnt);
 }
 
 /**
- * blkg_try_get_closest - try and get a blkg ref on the closet blkg
+ * blkg_tryget_closest - try and get a blkg ref on the closet blkg
  * @blkg: blkg to get
  *
  * This walks up the blkg tree to find the closest non-dying blkg and returns
  * the blkg that it did association with as it may not be the passed in blkg.
  */
-static inline struct blkcg_gq *blkg_try_get_closest(struct blkcg_gq *blkg)
+static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg)
 {
 	while (!percpu_ref_tryget(&blkg->refcnt))
 		blkg = blkg->parent;
-- 
cgit v1.2.3


From 4705de735b3383792c84a92e57508d6865caa85f Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Thu, 6 Dec 2018 12:49:38 -0500
Subject: blkcg: put back rcu lock in blkcg_bio_issue_check()

I was a little overzealous in removing the rcu_read_lock() call from
blkcg_bio_issue_check() and it broke blk-throttle. Put it back.

Fixes: e35403a034bf ("blkcg: associate blkg when associating a device")
Signed-off-by: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 752de1becb5c..bf13ecb0fe4f 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -764,6 +764,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	struct blkcg_gq *blkg;
 	bool throtl = false;
 
+	rcu_read_lock();
+
 	if (!bio->bi_blkg) {
 		char b[BDEVNAME_SIZE];
 
@@ -791,6 +793,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
 	blkcg_bio_issue_init(bio);
 
+	rcu_read_unlock();
 	return !throtl;
 }
 
-- 
cgit v1.2.3


From 12b2117161ddbdcdb69777404c5aa2a9fe6ad7d5 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Fri, 2 Nov 2018 10:28:12 -0700
Subject: nvme: introduce ctrl attributes enumeration

We are growing more controller attributes, so use a proper enumeration
for it.  For now just add the 128-bit hostid which we support.

Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 818dbe9331be..753c83a5c01f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -198,6 +198,10 @@ enum {
 	NVME_PS_FLAGS_NON_OP_STATE	= 1 << 1,
 };
 
+enum nvme_ctrl_attr {
+	NVME_CTRL_ATTR_HID_128_BIT	= (1 << 0),
+};
+
 struct nvme_id_ctrl {
 	__le16			vid;
 	__le16			ssvid;
-- 
cgit v1.2.3


From 6e3ca03ee934572d5de4fb2224c01e12c4d422c8 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Fri, 2 Nov 2018 10:28:15 -0700
Subject: nvme: support traffic based keep-alive

If the controller supports traffic based keep alive, we restart the keep
alive timer if any admin or io commands was completed during the kato
period.  This prevents a possible starvation of keep alive commands in
the presence of heavy traffic as in such case, we already have a health
indication from the host perspective.

Only set a comp_seen indicator in case the controller supports keep
alive to minimize the overhead for pci controllers.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 753c83a5c01f..429c4cf90899 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -200,6 +200,7 @@ enum {
 
 enum nvme_ctrl_attr {
 	NVME_CTRL_ATTR_HID_128_BIT	= (1 << 0),
+	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
 };
 
 struct nvme_id_ctrl {
-- 
cgit v1.2.3


From 7114ddeb40c0ccc584d86df598da4054ca4cd79f Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@intel.com>
Date: Mon, 12 Nov 2018 13:56:34 -0800
Subject: nvmet: change aen mask functions to use bit numbers

Functions nvmet_aen_disabled and nvmet_clear_aen were using
values not bit numbers ie 1 << 9 not 9 for bit function clear_bit
and test_and_set_bit.

Signed-off-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Phil Cayton <phil.cayton@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 429c4cf90899..d6cfa194be80 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -489,9 +489,15 @@ enum {
 };
 
 enum {
-	NVME_AEN_CFG_NS_ATTR		= 1 << 8,
-	NVME_AEN_CFG_FW_ACT		= 1 << 9,
-	NVME_AEN_CFG_ANA_CHANGE		= 1 << 11,
+	NVME_AEN_BIT_NS_ATTR		= 8,
+	NVME_AEN_BIT_FW_ACT		= 9,
+	NVME_AEN_BIT_ANA_CHANGE		= 11,
+};
+
+enum {
+	NVME_AEN_CFG_NS_ATTR		= 1 << NVME_AEN_BIT_NS_ATTR,
+	NVME_AEN_CFG_FW_ACT		= 1 << NVME_AEN_BIT_FW_ACT,
+	NVME_AEN_CFG_ANA_CHANGE		= 1 << NVME_AEN_BIT_ANA_CHANGE,
 };
 
 struct nvme_lba_range_type {
-- 
cgit v1.2.3


From f301c2b1368905340133ff8ef4485befdd0b7e2d Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@intel.com>
Date: Mon, 12 Nov 2018 13:56:37 -0800
Subject: nvmet: add defines for discovery change async events

Add AEN/AER values as defined by the specification

Signed-off-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d6cfa194be80..77d320d32ee5 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -486,18 +486,21 @@ enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
 	NVME_AER_NOTICE_ANA		= 0x03,
+	NVME_AER_NOTICE_DISC_CHANGED	= 0xf0,
 };
 
 enum {
 	NVME_AEN_BIT_NS_ATTR		= 8,
 	NVME_AEN_BIT_FW_ACT		= 9,
 	NVME_AEN_BIT_ANA_CHANGE		= 11,
+	NVME_AEN_BIT_DISC_CHANGE	= 31,
 };
 
 enum {
 	NVME_AEN_CFG_NS_ATTR		= 1 << NVME_AEN_BIT_NS_ATTR,
 	NVME_AEN_CFG_FW_ACT		= 1 << NVME_AEN_BIT_FW_ACT,
 	NVME_AEN_CFG_ANA_CHANGE		= 1 << NVME_AEN_BIT_ANA_CHANGE,
+	NVME_AEN_CFG_DISC_CHANGE	= 1 << NVME_AEN_BIT_DISC_CHANGE,
 };
 
 struct nvme_lba_range_type {
-- 
cgit v1.2.3


From 6e2e312ea7ff73acfafaa5c9851e151e9483c761 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Wed, 14 Nov 2018 15:57:46 -0800
Subject: nvmet-fc: remove the IN_ISR deferred scheduling options

All target lldd's call the cmd receive and op completions in non-isr
thread contexts. As such the IN_ISR options are not necessary.
Remove the functionality and flags, which also removes cpu assignments
to queues.

Signed-off-by: James Smart <jsmart2021@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme-fc-driver.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index f4ab3b1925ac..91745cc3704c 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -648,22 +648,6 @@ enum {
 		 * sequence in one LLDD operation. Errors during Data
 		 * sequence transmit must not allow RSP sequence to be sent.
 		 */
-	NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 1),
-		/* Bit 2: When 0, the LLDD is calling the cmd rcv handler
-		 * in a non-isr context, allowing the transport to finish
-		 * op completion in the calling context. When 1, the LLDD
-		 * is calling the cmd rcv handler in an ISR context,
-		 * requiring the transport to transition to a workqueue
-		 * for op completion.
-		 */
-	NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 2),
-		/* Bit 3: When 0, the LLDD is calling the op done handler
-		 * in a non-isr context, allowing the transport to finish
-		 * op completion in the calling context. When 1, the LLDD
-		 * is calling the op done handler in an ISR context,
-		 * requiring the transport to transition to a workqueue
-		 * for op completion.
-		 */
 };
 
 
-- 
cgit v1.2.3


From e6a622fd6d66b83779357e3400f487fc159a7d83 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 19 Nov 2018 14:11:12 -0800
Subject: nvmet: support fabrics sq flow control

Technical proposal 8005 "fabrics SQ flow control" introduces a mode
where a host and controller agree to omit sq_head pointer updates
when sending nvme completions.

In case the host indicated desire to operate in this mode (connect attribute)
the controller will return back a connect completion with sq_head value
of 0xffff as indication that it will omit sq_head pointer updates.

This mode saves us an atomic update in the I/O path.

Reviewed-by: Hannes Reinecke <hare@suse.com>
[hch: suggested better implementation]
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 77d320d32ee5..e7d731776f62 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1044,6 +1044,10 @@ struct nvmf_disc_rsp_page_hdr {
 	struct nvmf_disc_rsp_page_entry entries[0];
 };
 
+enum {
+	NVME_CONNECT_DISABLE_SQFLOW	= (1 << 2),
+};
+
 struct nvmf_connect_command {
 	__u8		opcode;
 	__u8		resv1;
-- 
cgit v1.2.3


From 0445e1b5a2fed4612b7f72d9a56889c026b60aa9 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 19 Nov 2018 14:11:13 -0800
Subject: nvmet: don't override treq upon modification.

Only override the allowed parts of it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
[hch: slight tweak to the NVME_TREQ_SECURE_CHANNEL_MASK definition]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index e7d731776f62..4fc48071e5ea 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -61,6 +61,8 @@ enum {
 	NVMF_TREQ_NOT_SPECIFIED	= 0,	/* Not specified */
 	NVMF_TREQ_REQUIRED	= 1,	/* Required */
 	NVMF_TREQ_NOT_REQUIRED	= 2,	/* Not Required */
+#define NVME_TREQ_SECURE_CHANNEL_MASK \
+	(NVMF_TREQ_REQUIRED | NVMF_TREQ_NOT_REQUIRED)
 };
 
 /* RDMA QP Service Type codes for Discovery Log Page entry TSAS
-- 
cgit v1.2.3


From 9b95d2fb857f242aacbf4e205656818b0ef067e1 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 20 Nov 2018 10:34:19 +0100
Subject: nvmet: expose support for fabrics SQ flow control disable in treq

Technical Proposal introduces an indication for SQ flow control
disable support. Expose it since we are able to operate in this mode.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 4fc48071e5ea..c03973c215ad 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -58,11 +58,13 @@ enum {
 
 /* Transport Requirements codes for Discovery Log Page entry TREQ field */
 enum {
-	NVMF_TREQ_NOT_SPECIFIED	= 0,	/* Not specified */
-	NVMF_TREQ_REQUIRED	= 1,	/* Required */
-	NVMF_TREQ_NOT_REQUIRED	= 2,	/* Not Required */
+	NVMF_TREQ_NOT_SPECIFIED	= 0,		/* Not specified */
+	NVMF_TREQ_REQUIRED	= 1,		/* Required */
+	NVMF_TREQ_NOT_REQUIRED	= 2,		/* Not Required */
 #define NVME_TREQ_SECURE_CHANNEL_MASK \
 	(NVMF_TREQ_REQUIRED | NVMF_TREQ_NOT_REQUIRED)
+
+	NVMF_TREQ_DISABLE_SQFLOW = (1 << 2),	/* Supports SQ flow control disable */
 };
 
 /* RDMA QP Service Type codes for Discovery Log Page entry TSAS
-- 
cgit v1.2.3


From 49cd84b6f8b677ef45731ed56ddb802cdbb94c9e Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 27 Nov 2018 09:40:57 -0700
Subject: nvme: implement Enhanced Command Retry

A controller may have an internal state that is not able to successfully
process commands for a short duration. In such states, an immediate
command requeue is expected to fail. The driver may exceed its max
retry count, which permanently ends the command in failure when the same
command would succeed after waiting for the controller to be ready.

NVMe ratified TP 4033 provides a delay hint in the completion status
code for failed commands. Implement the retry delay based on the command
completion status and the controller's requested delay.

Note that requeued commands are handled per request_queue, not per
individual request. If multiple commands fail, the controller should
consistently report the desired delay time for retryable commands in
all CQEs, otherwise the requeue list may be kicked too soon.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index c03973c215ad..88812cb15be0 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -223,7 +223,11 @@ struct nvme_id_ctrl {
 	__le32			rtd3e;
 	__le32			oaes;
 	__le32			ctratt;
-	__u8			rsvd100[156];
+	__u8			rsvd100[28];
+	__le16			crdt1;
+	__le16			crdt2;
+	__le16			crdt3;
+	__u8			rsvd134[122];
 	__le16			oacs;
 	__u8			acl;
 	__u8			aerl;
@@ -756,6 +760,15 @@ enum {
 	NVME_HOST_MEM_RETURN	= (1 << 1),
 };
 
+struct nvme_feat_host_behavior {
+	__u8 acre;
+	__u8 resv1[511];
+};
+
+enum {
+	NVME_ENABLE_ACRE	= 1,
+};
+
 /* Admin commands */
 
 enum nvme_admin_opcode {
@@ -810,6 +823,7 @@ enum {
 	NVME_FEAT_RRL		= 0x12,
 	NVME_FEAT_PLM_CONFIG	= 0x13,
 	NVME_FEAT_PLM_WINDOW	= 0x14,
+	NVME_FEAT_HOST_BEHAVIOR	= 0x16,
 	NVME_FEAT_SW_PROGRESS	= 0x80,
 	NVME_FEAT_HOST_ID	= 0x81,
 	NVME_FEAT_RESV_MASK	= 0x82,
@@ -1265,6 +1279,7 @@ enum {
 	NVME_SC_ANA_TRANSITION		= 0x303,
 	NVME_SC_HOST_PATH_ERROR		= 0x370,
 
+	NVME_SC_CRD			= 0x1800,
 	NVME_SC_DNR			= 0x4000,
 };
 
-- 
cgit v1.2.3


From 112f158f66cbe25fd561a5dfe9c3826e06abf757 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 6 Dec 2018 11:41:18 -0500
Subject: block: stop passing 'cpu' to all percpu stats methods

All of part_stat_* and related methods are used with preempt disabled,
so there is no need to pass cpu around to allow of them.  Just call
smp_processor_id() as needed.

Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 0c5ee17b4d88..1677cd2a4c4e 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -295,8 +295,8 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
 #define part_stat_lock()	({ rcu_read_lock(); get_cpu(); })
 #define part_stat_unlock()	do { put_cpu(); rcu_read_unlock(); } while (0)
 
-#define __part_stat_add(cpu, part, field, addnd)			\
-	(per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd))
+#define __part_stat_add(part, field, addnd)				\
+	(per_cpu_ptr((part)->dkstats, smp_processor_id())->field += (addnd))
 
 #define part_stat_read(part, field)					\
 ({									\
@@ -333,7 +333,7 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_lock()	({ rcu_read_lock(); 0; })
 #define part_stat_unlock()	rcu_read_unlock()
 
-#define __part_stat_add(cpu, part, field, addnd)				\
+#define __part_stat_add(part, field, addnd)				\
 	((part)->dkstats.field += addnd)
 
 #define part_stat_read(part, field)	((part)->dkstats.field)
@@ -362,19 +362,19 @@ static inline void free_part_stats(struct hd_struct *part)
 	 part_stat_read(part, field[STAT_WRITE]) +			\
 	 part_stat_read(part, field[STAT_DISCARD]))
 
-#define part_stat_add(cpu, part, field, addnd)	do {			\
-	__part_stat_add((cpu), (part), field, addnd);			\
+#define part_stat_add(part, field, addnd)	do {			\
+	__part_stat_add((part), field, addnd);				\
 	if ((part)->partno)						\
-		__part_stat_add((cpu), &part_to_disk((part))->part0,	\
+		__part_stat_add(&part_to_disk((part))->part0,		\
 				field, addnd);				\
 } while (0)
 
-#define part_stat_dec(cpu, gendiskp, field)				\
-	part_stat_add(cpu, gendiskp, field, -1)
-#define part_stat_inc(cpu, gendiskp, field)				\
-	part_stat_add(cpu, gendiskp, field, 1)
-#define part_stat_sub(cpu, gendiskp, field, subnd)			\
-	part_stat_add(cpu, gendiskp, field, -subnd)
+#define part_stat_dec(gendiskp, field)					\
+	part_stat_add(gendiskp, field, -1)
+#define part_stat_inc(gendiskp, field)					\
+	part_stat_add(gendiskp, field, 1)
+#define part_stat_sub(gendiskp, field, subnd)				\
+	part_stat_add(gendiskp, field, -subnd)
 
 void part_in_flight(struct request_queue *q, struct hd_struct *part,
 		    unsigned int inflight[2]);
@@ -399,7 +399,7 @@ static inline void free_part_info(struct hd_struct *part)
 }
 
 /* block/blk-core.c */
-extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part);
+extern void part_round_stats(struct request_queue *q, struct hd_struct *part);
 
 /* block/genhd.c */
 extern void device_add_disk(struct device *parent, struct gendisk *disk,
-- 
cgit v1.2.3


From 5b18b5a737600fd20ba2045f320d5926ebbf341a Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 6 Dec 2018 11:41:19 -0500
Subject: block: delete part_round_stats and switch to less precise counting

We want to convert to per-cpu in_flight counters.

The function part_round_stats needs the in_flight counter every jiffy, it
would be too costly to sum all the percpu variables every jiffy, so it
must be deleted. part_round_stats is used to calculate two counters -
time_in_queue and io_ticks.

time_in_queue can be calculated without part_round_stats, by adding the
duration of the I/O when the I/O ends (the value is almost as exact as the
previously calculated value, except that time for in-progress I/Os is not
counted).

io_ticks can be approximated by increasing the value when I/O is started
or ended and the jiffies value has changed. If the I/Os take less than a
jiffy, the value is as exact as the previously calculated value. If the
I/Os take more than a jiffy, io_ticks can drift behind the previously
calculated value.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 1677cd2a4c4e..838c2a7a40c5 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -398,8 +398,7 @@ static inline void free_part_info(struct hd_struct *part)
 	kfree(part->info);
 }
 
-/* block/blk-core.c */
-extern void part_round_stats(struct request_queue *q, struct hd_struct *part);
+void update_io_ticks(struct hd_struct *part, unsigned long now);
 
 /* block/genhd.c */
 extern void device_add_disk(struct device *parent, struct gendisk *disk,
-- 
cgit v1.2.3


From 1226b8dd0e91331cfab500f305b2c264445a0392 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 6 Dec 2018 11:41:20 -0500
Subject: block: switch to per-cpu in-flight counters

Now when part_round_stats is gone, we can switch to per-cpu in-flight
counters.

We use the local-atomic type local_t, so that if part_inc_in_flight or
part_dec_in_flight is reentrantly called from an interrupt, the value will
be correct.

The other counters could be corrupted due to reentrant interrupt, but the
corruption only results in slight counter skew - the in_flight counter
must be exact, so it needs local_t.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 838c2a7a40c5..636b4f687e35 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -17,6 +17,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
 #include <linux/blk_types.h>
+#include <asm/local.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -89,6 +90,7 @@ struct disk_stats {
 	unsigned long merges[NR_STAT_GROUPS];
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
+	local_t in_flight[2];
 };
 
 #define PARTITION_META_INFO_VOLNAMELTH	64
@@ -122,7 +124,6 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	atomic_t in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats __percpu *dkstats;
 #else
@@ -295,8 +296,11 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
 #define part_stat_lock()	({ rcu_read_lock(); get_cpu(); })
 #define part_stat_unlock()	do { put_cpu(); rcu_read_unlock(); } while (0)
 
-#define __part_stat_add(part, field, addnd)				\
-	(per_cpu_ptr((part)->dkstats, smp_processor_id())->field += (addnd))
+#define part_stat_get_cpu(part, field, cpu)					\
+	(per_cpu_ptr((part)->dkstats, (cpu))->field)
+
+#define part_stat_get(part, field)					\
+	part_stat_get_cpu(part, field, smp_processor_id())
 
 #define part_stat_read(part, field)					\
 ({									\
@@ -333,10 +337,9 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_lock()	({ rcu_read_lock(); 0; })
 #define part_stat_unlock()	rcu_read_unlock()
 
-#define __part_stat_add(part, field, addnd)				\
-	((part)->dkstats.field += addnd)
-
-#define part_stat_read(part, field)	((part)->dkstats.field)
+#define part_stat_get(part, field)		((part)->dkstats.field)
+#define part_stat_get_cpu(part, field, cpu)	part_stat_get(part, field)
+#define part_stat_read(part, field)		part_stat_get(part, field)
 
 static inline void part_stat_set_all(struct hd_struct *part, int value)
 {
@@ -362,6 +365,9 @@ static inline void free_part_stats(struct hd_struct *part)
 	 part_stat_read(part, field[STAT_WRITE]) +			\
 	 part_stat_read(part, field[STAT_DISCARD]))
 
+#define __part_stat_add(part, field, addnd)				\
+	(part_stat_get(part, field) += (addnd))
+
 #define part_stat_add(part, field, addnd)	do {			\
 	__part_stat_add((part), field, addnd);				\
 	if ((part)->partno)						\
@@ -376,6 +382,15 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_sub(gendiskp, field, subnd)				\
 	part_stat_add(gendiskp, field, -subnd)
 
+#define part_stat_local_dec(gendiskp, field)				\
+	local_dec(&(part_stat_get(gendiskp, field)))
+#define part_stat_local_inc(gendiskp, field)				\
+	local_inc(&(part_stat_get(gendiskp, field)))
+#define part_stat_local_read(gendiskp, field)				\
+	local_read(&(part_stat_get(gendiskp, field)))
+#define part_stat_local_read_cpu(gendiskp, field, cpu)			\
+	local_read(&(part_stat_get_cpu(gendiskp, field, cpu)))
+
 void part_in_flight(struct request_queue *q, struct hd_struct *part,
 		    unsigned int inflight[2]);
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
-- 
cgit v1.2.3


From e016b78201a2d9ff40f3f0da072292689af24c7f Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 6 Dec 2018 11:41:21 -0500
Subject: block: return just one value from part_in_flight

The previous patches deleted all the code that needed the second value
returned from part_in_flight - now the kernel only uses the first value.

Consequently, part_in_flight (and blk_mq_in_flight) may be changed so that
it only returns one value.

This patch just refactors the code, there's no functional change.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 636b4f687e35..06c0fd594097 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -391,8 +391,7 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_local_read_cpu(gendiskp, field, cpu)			\
 	local_read(&(part_stat_get_cpu(gendiskp, field, cpu)))
 
-void part_in_flight(struct request_queue *q, struct hd_struct *part,
-		    unsigned int inflight[2]);
+unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part);
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 		       unsigned int inflight[2]);
 void part_dec_in_flight(struct request_queue *q, struct hd_struct *part,
-- 
cgit v1.2.3


From 24828d0536bbedc9b265f2b01ffca99de3f6a7c7 Mon Sep 17 00:00:00 2001
From: Igor Konopko <igor.j.konopko@intel.com>
Date: Tue, 11 Dec 2018 20:16:24 +0100
Subject: lightnvm: dynamic DMA pool entry size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently lightnvm and pblk uses single DMA pool, for which the entry
size always is equal to PAGE_SIZE. The contents of each entry allocated
from the DMA pool consists of a PPA list (8bytes * 64), leaving
56bytes * 64 space for metadata. Since the metadata field can be bigger,
such as 128 bytes, the static size does not cover this use-case.

This patch adds support for I/O metadata above 56 bytes by changing DMA
pool size based on device meta size and allows pblk to use OOB metadata
>=16B.

Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Igor Konopko <igor.j.konopko@intel.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 2fdeac1a420d..7afedaddbd15 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -90,7 +90,7 @@ typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int,
 							struct nvm_chk_meta *);
 typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
+typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int);
 typedef void (nvm_destroy_dma_pool_fn)(void *);
 typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
 								dma_addr_t *);
-- 
cgit v1.2.3


From a16816b9e462e8ee86a908606bde54b53cfeca80 Mon Sep 17 00:00:00 2001
From: Igor Konopko <igor.j.konopko@intel.com>
Date: Tue, 11 Dec 2018 20:16:25 +0100
Subject: lightnvm: disable interleaved metadata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently pblk only check the size of I/O metadata and does not take
into account if this metadata is in a separate buffer or interleaved
in a single metadata buffer.

In reality only the first scenario is supported, where second mode will
break pblk functionality during any IO operation.

This patch prevents pblk to be instantiated in case device only
supports interleaved metadata.

Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Igor Konopko <igor.j.konopko@intel.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 7afedaddbd15..5d865a5d5cdc 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -357,6 +357,7 @@ struct nvm_geo {
 	u32	clba;		/* sectors per chunk */
 	u16	csecs;		/* sector size */
 	u16	sos;		/* out-of-band area size */
+	bool	ext;		/* metadata in extended data buffer */
 
 	/* device write constrains */
 	u32	ws_min;		/* minimum write size */
-- 
cgit v1.2.3


From 0273ac349f08f4ff9ef88aaaf9c9f2aa6e87d2be Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Tue, 11 Dec 2018 18:03:08 -0500
Subject: blkcg: handle dying request_queue when associating a blkg

Between v3 [1] and v4 [2] of the blkg association series, the
association point moved from generic_make_request_checks(), which is
called after the request enters the queue, to bio_set_dev(), which is when
the bio is formed before submit_bio(). When the request_queue goes away,
the blkgs supporting the request_queue are destroyed and then the
q->root_blkg is set to %NULL.

This patch adds a %NULL check to blkg_tryget_closest() to prevent the
NPE caused by the above. It also adds a guard to see if the
request_queue is dying when creating a blkg to prevent creating a blkg
for a dead request_queue.

[1] https://lore.kernel.org/lkml/20180911184137.35897-1-dennisszhou@gmail.com/
[2] https://lore.kernel.org/lkml/20181126211946.77067-1-dennis@kernel.org/

Fixes: 5cdf2e3fea5e ("blkcg: associate blkg when associating a device")
Reported-and-tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index bf13ecb0fe4f..f025fd1e22e6 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -511,7 +511,7 @@ static inline bool blkg_tryget(struct blkcg_gq *blkg)
  */
 static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg)
 {
-	while (!percpu_ref_tryget(&blkg->refcnt))
+	while (blkg && !percpu_ref_tryget(&blkg->refcnt))
 		blkg = blkg->parent;
 
 	return blkg;
-- 
cgit v1.2.3


From cb002d074dabfaa2248507fd9478d16a542e4f1e Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:07 -0800
Subject: iov_iter: pass void csum pointer to csum_and_copy_to_iter

The single caller to csum_and_copy_to_iter is skb_copy_and_csum_datagram
and we are trying to unite its logic with skb_copy_datagram_iter by passing
a callback to the copy function that we want to apply. Thus, we need
to make the checksum pointer private to the function.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/uio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 55ce99ddb912..41d1f8d3313d 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -266,7 +266,7 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 {
 	i->count = count;
 }
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
+size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i);
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 
-- 
cgit v1.2.3


From d05f443554b3c7dc6d46e3ba9c3c4de468875d4f Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:09 -0800
Subject: iov_iter: introduce hash_and_copy_to_iter helper

Allow consumers that want to use iov iterator helpers and also update
a predefined hash calculation online when copying data. This is useful
when copying incoming network buffers to a local iterator and calculate
a digest on the incoming stream. nvme-tcp host driver that will be
introduced in following patches is the first consumer via
skb_copy_and_hash_datagram_iter.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/uio.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 41d1f8d3313d..ecf584f6b82d 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/thread_info.h>
+#include <crypto/hash.h>
 #include <uapi/linux/uio.h>
 
 struct page;
@@ -269,6 +270,8 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i);
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
+size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
+		struct iov_iter *i);
 
 int import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
-- 
cgit v1.2.3


From 65d69e2505bb64f6a8d7f417f6e46e2a351174c6 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:10 -0800
Subject: datagram: introduce skb_copy_and_hash_datagram_iter helper

Introduce a helper to copy datagram into an iovec iterator
but also update a predefined hash. This is useful for
consumers of skb_copy_datagram_iter to also support inflight
data digest without having to finish to copy and only then
traverse the iovec and calculate the digest hash.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/skbuff.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0d1b2c3f127b..b96c809c29eb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3325,6 +3325,9 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
 }
 int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
 				   struct msghdr *msg);
+int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len,
+			   struct ahash_request *hash);
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
-- 
cgit v1.2.3


From fc221d05447aa6db686a6724dd08aa6cce0924d1 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:14 -0800
Subject: nvme-tcp: Add protocol header

Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme-tcp.h | 189 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h     |   1 +
 2 files changed, 190 insertions(+)
 create mode 100644 include/linux/nvme-tcp.h

(limited to 'include/linux')

diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
new file mode 100644
index 000000000000..03d87c0550a9
--- /dev/null
+++ b/include/linux/nvme-tcp.h
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics TCP protocol header.
+ * Copyright (c) 2018 Lightbits Labs. All rights reserved.
+ */
+
+#ifndef _LINUX_NVME_TCP_H
+#define _LINUX_NVME_TCP_H
+
+#include <linux/nvme.h>
+
+#define NVME_TCP_DISC_PORT	8009
+#define NVME_TCP_ADMIN_CCSZ	SZ_8K
+#define NVME_TCP_DIGEST_LENGTH	4
+
+enum nvme_tcp_pfv {
+	NVME_TCP_PFV_1_0 = 0x0,
+};
+
+enum nvme_tcp_fatal_error_status {
+	NVME_TCP_FES_INVALID_PDU_HDR		= 0x01,
+	NVME_TCP_FES_PDU_SEQ_ERR		= 0x02,
+	NVME_TCP_FES_HDR_DIGEST_ERR		= 0x03,
+	NVME_TCP_FES_DATA_OUT_OF_RANGE		= 0x04,
+	NVME_TCP_FES_R2T_LIMIT_EXCEEDED		= 0x05,
+	NVME_TCP_FES_DATA_LIMIT_EXCEEDED	= 0x05,
+	NVME_TCP_FES_UNSUPPORTED_PARAM		= 0x06,
+};
+
+enum nvme_tcp_digest_option {
+	NVME_TCP_HDR_DIGEST_ENABLE	= (1 << 0),
+	NVME_TCP_DATA_DIGEST_ENABLE	= (1 << 1),
+};
+
+enum nvme_tcp_pdu_type {
+	nvme_tcp_icreq		= 0x0,
+	nvme_tcp_icresp		= 0x1,
+	nvme_tcp_h2c_term	= 0x2,
+	nvme_tcp_c2h_term	= 0x3,
+	nvme_tcp_cmd		= 0x4,
+	nvme_tcp_rsp		= 0x5,
+	nvme_tcp_h2c_data	= 0x6,
+	nvme_tcp_c2h_data	= 0x7,
+	nvme_tcp_r2t		= 0x9,
+};
+
+enum nvme_tcp_pdu_flags {
+	NVME_TCP_F_HDGST		= (1 << 0),
+	NVME_TCP_F_DDGST		= (1 << 1),
+	NVME_TCP_F_DATA_LAST		= (1 << 2),
+	NVME_TCP_F_DATA_SUCCESS		= (1 << 3),
+};
+
+/**
+ * struct nvme_tcp_hdr - nvme tcp pdu common header
+ *
+ * @type:          pdu type
+ * @flags:         pdu specific flags
+ * @hlen:          pdu header length
+ * @pdo:           pdu data offset
+ * @plen:          pdu wire byte length
+ */
+struct nvme_tcp_hdr {
+	__u8	type;
+	__u8	flags;
+	__u8	hlen;
+	__u8	pdo;
+	__le32	plen;
+};
+
+/**
+ * struct nvme_tcp_icreq_pdu - nvme tcp initialize connection request pdu
+ *
+ * @hdr:           pdu generic header
+ * @pfv:           pdu version format
+ * @hpda:          host pdu data alignment (dwords, 0's based)
+ * @digest:        digest types enabled
+ * @maxr2t:        maximum r2ts per request supported
+ */
+struct nvme_tcp_icreq_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__le16			pfv;
+	__u8			hpda;
+	__u8			digest;
+	__le32			maxr2t;
+	__u8			rsvd2[112];
+};
+
+/**
+ * struct nvme_tcp_icresp_pdu - nvme tcp initialize connection response pdu
+ *
+ * @hdr:           pdu common header
+ * @pfv:           pdu version format
+ * @cpda:          controller pdu data alignment (dowrds, 0's based)
+ * @digest:        digest types enabled
+ * @maxdata:       maximum data capsules per r2t supported
+ */
+struct nvme_tcp_icresp_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__le16			pfv;
+	__u8			cpda;
+	__u8			digest;
+	__le32			maxdata;
+	__u8			rsvd[112];
+};
+
+/**
+ * struct nvme_tcp_term_pdu - nvme tcp terminate connection pdu
+ *
+ * @hdr:           pdu common header
+ * @fes:           fatal error status
+ * @fei:           fatal error information
+ */
+struct nvme_tcp_term_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__le16			fes;
+	__le32			fei;
+	__u8			rsvd[8];
+};
+
+/**
+ * struct nvme_tcp_cmd_pdu - nvme tcp command capsule pdu
+ *
+ * @hdr:           pdu common header
+ * @cmd:           nvme command
+ */
+struct nvme_tcp_cmd_pdu {
+	struct nvme_tcp_hdr	hdr;
+	struct nvme_command	cmd;
+};
+
+/**
+ * struct nvme_tcp_rsp_pdu - nvme tcp response capsule pdu
+ *
+ * @hdr:           pdu common header
+ * @hdr:           nvme-tcp generic header
+ * @cqe:           nvme completion queue entry
+ */
+struct nvme_tcp_rsp_pdu {
+	struct nvme_tcp_hdr	hdr;
+	struct nvme_completion	cqe;
+};
+
+/**
+ * struct nvme_tcp_r2t_pdu - nvme tcp ready-to-transfer pdu
+ *
+ * @hdr:           pdu common header
+ * @command_id:    nvme command identifier which this relates to
+ * @ttag:          transfer tag (controller generated)
+ * @r2t_offset:    offset from the start of the command data
+ * @r2t_length:    length the host is allowed to send
+ */
+struct nvme_tcp_r2t_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__u16			command_id;
+	__u16			ttag;
+	__le32			r2t_offset;
+	__le32			r2t_length;
+	__u8			rsvd[4];
+};
+
+/**
+ * struct nvme_tcp_data_pdu - nvme tcp data pdu
+ *
+ * @hdr:           pdu common header
+ * @command_id:    nvme command identifier which this relates to
+ * @ttag:          transfer tag (controller generated)
+ * @data_offset:   offset from the start of the command data
+ * @data_length:   length of the data stream
+ */
+struct nvme_tcp_data_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__u16			command_id;
+	__u16			ttag;
+	__le32			data_offset;
+	__le32			data_length;
+	__u8			rsvd[4];
+};
+
+union nvme_tcp_pdu {
+	struct nvme_tcp_icreq_pdu	icreq;
+	struct nvme_tcp_icresp_pdu	icresp;
+	struct nvme_tcp_cmd_pdu		cmd;
+	struct nvme_tcp_rsp_pdu		rsp;
+	struct nvme_tcp_r2t_pdu		r2t;
+	struct nvme_tcp_data_pdu	data;
+};
+
+#endif /* _LINUX_NVME_TCP_H */
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 88812cb15be0..4d7907e3771e 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -52,6 +52,7 @@ enum {
 enum {
 	NVMF_TRTYPE_RDMA	= 1,	/* RDMA */
 	NVMF_TRTYPE_FC		= 2,	/* Fibre Channel */
+	NVMF_TRTYPE_TCP		= 3,	/* TCP/IP */
 	NVMF_TRTYPE_LOOP	= 254,	/* Reserved for host usage */
 	NVMF_TRTYPE_MAX,
 };
-- 
cgit v1.2.3


From b7c8f3663d0e0773aca3324c26bce3ca8343ec14 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:37 -0800
Subject: nvme: remove nvme_common command cdw10 array

This is a preparation patch which removes the nvme common command cdw10
array and replace with individual fields. This is needed for the nvmet
error log page implementation make is error log page entry offset
assignment easier.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 4d7907e3771e..b94fe8fadc4f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -662,7 +662,12 @@ struct nvme_common_command {
 	__le32			cdw2[2];
 	__le64			metadata;
 	union nvme_data_ptr	dptr;
-	__le32			cdw10[6];
+	__le32			cdw10;
+	__le32			cdw11;
+	__le32			cdw12;
+	__le32			cdw13;
+	__le32			cdw14;
+	__le32			cdw15;
 };
 
 struct nvme_rw_command {
-- 
cgit v1.2.3


From b34de7cee0a65f2557bb05447fbe2cc7a9c46750 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:38 -0800
Subject: nvme: add error log page slot definition

This patch adds the NVMe error slot definition from the spec.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b94fe8fadc4f..bbcc83886899 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1168,6 +1168,20 @@ struct nvme_command {
 	};
 };
 
+struct nvme_error_slot {
+	__le64		error_count;
+	__le16		sqid;
+	__le16		cmdid;
+	__le16		status_field;
+	__le16		param_error_location;
+	__le64		lba;
+	__le32		nsid;
+	__u8		vs;
+	__u8		resv[3];
+	__le64		cs;
+	__u8		resv2[24];
+};
+
 static inline bool nvme_is_write(struct nvme_command *cmd)
 {
 	/*
-- 
cgit v1.2.3


From e42b3867de4bd5ee3a1849afb68a1fa8627f7282 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 11 Dec 2018 23:38:54 -0800
Subject: blk-mq-rdma: pass in queue map to blk_mq_rdma_map_queues

Will be used by nvme-rdma for queue map separation support.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/blk-mq-rdma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h
index b4ade198007d..7b6ecf9ac4c3 100644
--- a/include/linux/blk-mq-rdma.h
+++ b/include/linux/blk-mq-rdma.h
@@ -4,7 +4,7 @@
 struct blk_mq_tag_set;
 struct ib_device;
 
-int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set,
+int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
 		struct ib_device *dev, int first_vec);
 
 #endif /* _LINUX_BLK_MQ_RDMA_H */
-- 
cgit v1.2.3


From cc56694f132a8f5fa9334e3afe990de8c3378866 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 17 Dec 2018 09:46:00 +0800
Subject: blk-mq-debugfs: support rq_qos

blk-mq-debugfs has been proved as very helpful for debug some
tough issues, such as IO hang.

We have seen blk-wbt related IO hang several times, even inside
Red Hat BZ, there is such report not sovled yet, so this patch
adds support debugfs on rq_qos.

Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 81f1b105946b..45552e6eae1e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -560,6 +560,7 @@ struct request_queue {
 #ifdef CONFIG_BLK_DEBUG_FS
 	struct dentry		*debugfs_dir;
 	struct dentry		*sched_debugfs_dir;
+	struct dentry		*rqos_debugfs_dir;
 #endif
 
 	bool			mq_sysfs_init_done;
-- 
cgit v1.2.3


From 13369816cb648f897ce9cbf57e55eeb742ce4eb3 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Mon, 17 Dec 2018 11:03:51 -0500
Subject: block: fix blk-iolatency accounting underflow

The blk-iolatency controller measures the time from rq_qos_throttle() to
rq_qos_done_bio() and attributes this time to the first bio that needs
to create the request. This means if a bio is plug-mergeable or
bio-mergeable, it gets to bypass the blk-iolatency controller.

The recent series [1], to tag all bios w/ blkgs undermined how iolatency
was determining which bios it was charging and should process in
rq_qos_done_bio(). Because all bios are being tagged, this caused the
atomic_t for the struct rq_wait inflight count to underflow and result
in a stall.

This patch adds a new flag BIO_TRACKED to let controllers know that a
bio is going through the rq_qos path. blk-iolatency now checks if this
flag is set to see if it should process the bio in rq_qos_done_bio().

Overloading BLK_QUEUE_ENTERED works, but makes the flag rules confusing.
BIO_THROTTLED was another candidate, but the flag is set for all bios
that have gone through blk-throttle code. Overloading a flag comes with
the burden of making sure that when either implementation changes, a
change in setting rules for one doesn't cause a bug in the other. So
here, we unfortunately opt for adding a new flag.

[1] https://lore.kernel.org/lkml/20181205171039.73066-1-dennis@kernel.org/

Fixes: 5cdf2e3fea5e ("blkcg: associate blkg when associating a device")
Signed-off-by: Dennis Zhou <dennis@kernel.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 46c005d601ac..fc99474ac968 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -228,6 +228,7 @@ struct bio {
 #define BIO_TRACE_COMPLETION 10	/* bio_endio() should trace the final completion
 				 * of this bio. */
 #define BIO_QUEUE_ENTERED 11	/* can use blk_queue_enter_live() */
+#define BIO_TRACKED 12		/* set if bio goes through the rq_qos path */
 
 /* See BVEC_POOL_OFFSET below before adding new flags */
 
-- 
cgit v1.2.3


From 3c94d83cb352627f221d971b05f163c17527de74 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 17 Dec 2018 21:11:17 -0700
Subject: blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight()

There's a single user of this function, dm, and dm just wants
to check if IO is inflight, not that it's just allocated.

This fixes a hang with srp/002 in blktests with dm, where it tries
to suspend but waits for inflight IO to finish first. As it checks
for just allocated requests, this fails.

Tested-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 57eda7b20243..d3c0a0d2680b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -257,7 +257,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
-bool blk_mq_queue_busy(struct request_queue *q);
+bool blk_mq_queue_inflight(struct request_queue *q);
 
 enum {
 	/* return when out of requests */
-- 
cgit v1.2.3


From 7b7ab780a048699d2b9f416bf2d5c089d8d1028c Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Fri, 14 Dec 2018 11:06:06 -0800
Subject: block: make request_to_qc_t public

block consumers will need it for polling requests that
are sent with blk_execute_rq_nowait. Also, get rid of
blk_tag_to_qc_t and open-code it instead.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/blk-mq.h    | 10 ++++++++++
 include/linux/blk_types.h | 11 -----------
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d3c0a0d2680b..0e030f5f76b6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -357,4 +357,14 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
 	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
 
+static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
+		struct request *rq)
+{
+	if (rq->tag != -1)
+		return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
+
+	return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
+			BLK_QC_T_INTERNAL;
+}
+
 #endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index fc99474ac968..5c7e7f859a24 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -425,17 +425,6 @@ static inline bool blk_qc_t_valid(blk_qc_t cookie)
 	return cookie != BLK_QC_T_NONE;
 }
 
-static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num,
-				       bool internal)
-{
-	blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT);
-
-	if (internal)
-		ret |= BLK_QC_T_INTERNAL;
-
-	return ret;
-}
-
 static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
 {
 	return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
-- 
cgit v1.2.3


From 9f6b7ef6c3ebe35be77b0ae3cf12e4d25ae80420 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 20 Dec 2018 08:49:00 -0700
Subject: sbitmap: add helpers for add/del wait queue handling

After commit 5d2ee7122c73, users of sbitmap that need wait queue
handling must use the provided helpers. But we only added
prepare_to_wait()/finish_wait() style helpers, add the equivalent
add_wait_queue/list_del wrappers as we..

This is needed to ensure kyber plays by the sbitmap waitqueue
rules.

Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 03f50fcedc79..14d558146aea 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -560,13 +560,13 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
 void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
 
 struct sbq_wait {
-	int accounted;
+	struct sbitmap_queue *sbq;	/* if set, sbq_wait is accounted */
 	struct wait_queue_entry wait;
 };
 
 #define DEFINE_SBQ_WAIT(name)							\
 	struct sbq_wait name = {						\
-		.accounted = 0,							\
+		.sbq = NULL,							\
 		.wait = {							\
 			.private	= current,				\
 			.func		= autoremove_wake_function,		\
@@ -588,4 +588,16 @@ void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
 void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
 				struct sbq_wait *sbq_wait);
 
+/*
+ * Wrapper around add_wait_queue(), which maintains some extra internal state
+ */
+void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
+			    struct sbq_wait_state *ws,
+			    struct sbq_wait *sbq_wait);
+
+/*
+ * Must be paired with sbitmap_add_wait_queue()
+ */
+void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait);
+
 #endif /* __LINUX_SCALE_BITMAP_H */
-- 
cgit v1.2.3