From 26f76dce60d28028e5c1fbbc39e771366a27671f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <m@bjorling.me>
Date: Fri, 5 Jan 2018 14:15:59 +0100
Subject: lightnvm: use internal pblk methods
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that rrpc has been removed, the only users of the ppa helpers
is pblk. However, pblk already defines similar functions.

Switch pblk to use the internal ones, and remove the generic ppa
helpers.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 2d1d9de06728..14e274b7d094 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -418,25 +418,6 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev,
 	return l;
 }
 
-static inline int ppa_empty(struct ppa_addr ppa_addr)
-{
-	return (ppa_addr.ppa == ADDR_EMPTY);
-}
-
-static inline void ppa_set_empty(struct ppa_addr *ppa_addr)
-{
-	ppa_addr->ppa = ADDR_EMPTY;
-}
-
-static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
-{
-	if (ppa_empty(ppa1) || ppa_empty(ppa2))
-		return 0;
-
-	return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) &&
-					(ppa1.g.blk == ppa2.g.blk));
-}
-
 typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
 typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
-- 
cgit v1.2.3


From e3e13bcc14717800e3e3239ca3faac24f2f04575 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <m@bjorling.me>
Date: Fri, 5 Jan 2018 14:16:00 +0100
Subject: lightnvm: remove hybrid ocssd 1.2 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that rrpc have been removed. Also remove the hybrid 1.2 support
from the core.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h | 43 -------------------------------------------
 1 file changed, 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 14e274b7d094..97ceb841e9a0 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -50,10 +50,7 @@ struct nvm_id;
 struct nvm_dev;
 struct nvm_tgt_dev;
 
-typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *);
 typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *);
-typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
-				nvm_l2p_update_fn *, void *);
 typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
 typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
 typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
@@ -66,7 +63,6 @@ typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t);
 
 struct nvm_dev_ops {
 	nvm_id_fn		*identity;
-	nvm_get_l2p_tbl_fn	*get_l2p_tbl;
 	nvm_op_bb_tbl_fn	*get_bb_tbl;
 	nvm_op_set_bb_fn	*set_bb_tbl;
 
@@ -112,8 +108,6 @@ enum {
 	NVM_RSP_WARN_HIGHECC	= 0x4700,
 
 	/* Device opcodes */
-	NVM_OP_HBREAD		= 0x02,
-	NVM_OP_HBWRITE		= 0x81,
 	NVM_OP_PWRITE		= 0x91,
 	NVM_OP_PREAD		= 0x92,
 	NVM_OP_ERASE		= 0x90,
@@ -346,36 +340,6 @@ struct nvm_dev {
 	struct list_head targets;
 };
 
-static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo,
-						     u64 pba)
-{
-	struct ppa_addr l;
-	int secs, pgs, blks, luns;
-	sector_t ppa = pba;
-
-	l.ppa = 0;
-
-	div_u64_rem(ppa, geo->sec_per_pg, &secs);
-	l.g.sec = secs;
-
-	sector_div(ppa, geo->sec_per_pg);
-	div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
-	l.g.pg = pgs;
-
-	sector_div(ppa, geo->pgs_per_blk);
-	div_u64_rem(ppa, geo->blks_per_lun, &blks);
-	l.g.blk = blks;
-
-	sector_div(ppa, geo->blks_per_lun);
-	div_u64_rem(ppa, geo->luns_per_chnl, &luns);
-	l.g.lun = luns;
-
-	sector_div(ppa, geo->luns_per_chnl);
-	l.g.ch = ppa;
-
-	return l;
-}
-
 static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev,
 						  struct ppa_addr r)
 {
@@ -462,17 +426,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
 extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
 extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
 extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *);
-extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int);
-extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
-			   void *);
-extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
-extern void nvm_put_area(struct nvm_tgt_dev *, sector_t);
 extern void nvm_end_io(struct nvm_rq *);
 extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
 extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
 
-extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int);
-
 #else /* CONFIG_NVM */
 struct nvm_dev_ops;
 
-- 
cgit v1.2.3


From 98281a90acc04d8a10407dabd2e397e4312b80c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <javier@cnexlabs.com>
Date: Fri, 5 Jan 2018 14:16:01 +0100
Subject: lightnvm: remove unnecessary field from nvm_rq
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the wait filed in nvm_rq. It is not used anymore, as targets rely
on the functionality provided by the LightNVM subsystem when sending
sync I/O.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 97ceb841e9a0..07cdb05a9a87 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -233,7 +233,6 @@ struct nvm_rq {
 	void *meta_list;
 	dma_addr_t dma_meta_list;
 
-	struct completion *wait;
 	nvm_end_io_fn *end_io;
 
 	uint8_t opcode;
-- 
cgit v1.2.3


From bb27aa9ecd1f72e68b0fa2dffeb45bee3b1cb5ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <m@bjorling.me>
Date: Fri, 5 Jan 2018 14:16:02 +0100
Subject: lightnvm: remove lower page tables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The lower page table is unused. All page tables reported by 1.2
devices are all reporting a sequential 1:1 page mapping. This is
also not used going forward with the 2.0 revision.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 07cdb05a9a87..a5d8e0cbbb46 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -174,8 +174,6 @@ struct nvm_id_group {
 	u32	mpos;
 	u32	mccap;
 	u16	cpar;
-
-	struct nvm_id_lp_tbl lptbl;
 };
 
 struct nvm_addr_format {
@@ -313,10 +311,6 @@ struct nvm_dev {
 	/* Device information */
 	struct nvm_geo geo;
 
-	  /* lower page table */
-	int lps_per_blk;
-	int *lptbl;
-
 	unsigned long total_secs;
 
 	unsigned long *lun_map;
-- 
cgit v1.2.3


From fae7fae4077c24dc2be720b9f21f53adea98d7dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <matias@cnexlabs.com>
Date: Fri, 5 Jan 2018 14:16:03 +0100
Subject: lightnvm: make geometry structures 2.0 ready
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare for the 2.0 revision by adapting the geometry
structures to coexist with the 1.2 revision.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h | 52 ++++++++++++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index a5d8e0cbbb46..8e43bfebd38d 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -159,12 +159,16 @@ struct nvm_id_group {
 	u8	fmtype;
 	u8	num_ch;
 	u8	num_lun;
-	u8	num_pln;
-	u16	num_blk;
-	u16	num_pg;
-	u16	fpg_sz;
+	u16	num_chk;
+	u16	clba;
 	u16	csecs;
 	u16	sos;
+
+	u16	ws_min;
+	u16	ws_opt;
+	u16	ws_seq;
+	u16	ws_per_chk;
+
 	u32	trdt;
 	u32	trdm;
 	u32	tprt;
@@ -174,6 +178,11 @@ struct nvm_id_group {
 	u32	mpos;
 	u32	mccap;
 	u16	cpar;
+
+	/* 1.2 compatibility */
+	u8	num_pln;
+	u16	num_pg;
+	u16	fpg_sz;
 };
 
 struct nvm_addr_format {
@@ -259,31 +268,36 @@ enum {
 	NVM_BLK_ST_BAD =	0x8,	/* Bad block */
 };
 
+
 /* Device generic information */
 struct nvm_geo {
+	/* generic geometry */
 	int nr_chnls;
-	int nr_luns;
-	int luns_per_chnl; /* -1 if channels are not symmetric */
-	int nr_planes;
-	int sec_per_pg; /* only sectors for a single page */
-	int pgs_per_blk;
-	int blks_per_lun;
-	int fpg_size;
-	int pfpg_size; /* size of buffer if all pages are to be read */
+	int all_luns; /* across channels */
+	int nr_luns; /* per channel */
+	int nr_chks; /* per lun */
+
 	int sec_size;
 	int oob_size;
 	int mccap;
-	struct nvm_addr_format ppaf;
 
-	/* Calculated/Cached values. These do not reflect the actual usable
-	 * blocks at run-time.
-	 */
+	int sec_per_chk;
+	int sec_per_lun;
+
+	int ws_min;
+	int ws_opt;
+	int ws_seq;
+	int ws_per_chk;
+
 	int max_rq_size;
-	int plane_mode; /* drive device in single, double or quad mode */
 
+	struct nvm_addr_format ppaf;
+
+	/* Legacy 1.2 specific geometry */
+	int plane_mode; /* drive device in single, double or quad mode */
+	int nr_planes;
+	int sec_per_pg; /* only sectors for a single page */
 	int sec_per_pl; /* all sectors across planes */
-	int sec_per_blk;
-	int sec_per_lun;
 };
 
 /* sub-device structure */
-- 
cgit v1.2.3


From e53927393b9987b7c986b6364c27111077f0ea3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <javier@cnexlabs.com>
Date: Fri, 5 Jan 2018 14:16:14 +0100
Subject: lightnvm: set target over-provision on create ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow to set the over-provision percentage on target creation. In case
that the value is not provided, fall back to the default value set by
the target.

In pblk, set the default OP to 11% of the total size of the device

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 8e43bfebd38d..7f4b60abdf27 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -218,6 +218,10 @@ struct nvm_target {
 
 #define ADDR_EMPTY (~0ULL)
 
+#define NVM_TARGET_DEFAULT_OP (101)
+#define NVM_TARGET_MIN_OP (3)
+#define NVM_TARGET_MAX_OP (80)
+
 #define NVM_VERSION_MAJOR 1
 #define NVM_VERSION_MINOR 0
 #define NVM_VERSION_PATCH 0
@@ -291,6 +295,8 @@ struct nvm_geo {
 
 	int max_rq_size;
 
+	int op;
+
 	struct nvm_addr_format ppaf;
 
 	/* Legacy 1.2 specific geometry */
-- 
cgit v1.2.3


From 6cc77e9cb08041627fe1d32ac3a743249deb8167 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Dec 2017 15:43:38 +0900
Subject: block: introduce zoned block devices zone write locking

Components relying only on the request_queue structure for accessing
block devices (e.g. I/O schedulers) have a limited knowledged of the
device characteristics. In particular, the device capacity cannot be
easily discovered, which for a zoned block device also result in the
inability to easily know the number of zones of the device (the zone
size is indicated by the chunk_sectors field of the queue limits).

Introduce the nr_zones field to the request_queue structure to simplify
access to this information. Also, add the bitmap seq_zone_bitmap which
indicates which zones of the device are sequential zones (write
preferred or write required) and the bitmap seq_zones_wlock which
indicates if a zone is write locked, that is, if a write request
targeting a zone was dispatched to the device. These fields are
initialized by the low level block device driver (sd.c for ZBC/ZAC
disks). They are not initialized by stacking drivers (device mappers)
handling zoned block devices (e.g. dm-linear).

Using this, I/O schedulers can introduce zone write locking to control
request dispatching to a zoned block device and avoid write request
reordering by limiting to at most a single write request per zone
outside of the scheduler at any time.

Based on previous patches from Damien Le Moal.

Signed-off-by: Christoph Hellwig <hch@lst.de>
[Damien]
* Fixed comments and identation in blkdev.h
* Changed helper functions
* Fixed this commit message
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 111 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8089ca17db9a..46e606f5b44b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t;
 /* Look at ->special_vec for the actual data payload instead of the
    bio chain. */
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
+/* The per-zone write lock is held for this request */
+#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -546,6 +548,22 @@ struct request_queue {
 
 	struct queue_limits	limits;
 
+	/*
+	 * Zoned block device information for request dispatch control.
+	 * nr_zones is the total number of zones of the device. This is always
+	 * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
+	 * bits which indicates if a zone is conventional (bit clear) or
+	 * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
+	 * bits which indicates if a zone is write locked, that is, if a write
+	 * request targeting the zone was dispatched. All three fields are
+	 * initialized by the low level device driver (e.g. scsi/sd.c).
+	 * Stacking drivers (device mappers) may or may not initialize
+	 * these fields.
+	 */
+	unsigned int		nr_zones;
+	unsigned long		*seq_zones_bitmap;
+	unsigned long		*seq_zones_wlock;
+
 	/*
 	 * sg stuff
 	 */
@@ -790,6 +808,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
 
+static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+{
+	return q->nr_zones;
+}
+
+static inline unsigned int blk_queue_zone_no(struct request_queue *q,
+					     sector_t sector)
+{
+	if (!blk_queue_is_zoned(q))
+		return 0;
+	return sector >> ilog2(q->limits.chunk_sectors);
+}
+
+static inline bool blk_queue_zone_is_seq(struct request_queue *q,
+					 sector_t sector)
+{
+	if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap)
+		return false;
+	return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
+}
+
 static inline bool rq_is_sync(struct request *rq)
 {
 	return op_is_sync(rq->cmd_flags);
@@ -1029,6 +1068,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> 9;
 }
 
+static inline unsigned int blk_rq_zone_no(struct request *rq)
+{
+	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+}
+
+static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+{
+	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+}
+
 /*
  * Some commands like WRITE SAME have a payload or data transfer size which
  * is different from the size of the request.  Any driver that supports such
@@ -1578,7 +1627,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
 
 	if (q)
 		return blk_queue_zone_sectors(q);
+	return 0;
+}
 
+static inline unsigned int bdev_nr_zones(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return blk_queue_nr_zones(q);
 	return 0;
 }
 
@@ -1954,6 +2011,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
 						struct writeback_control *);
+
+#ifdef CONFIG_BLK_DEV_ZONED
+bool blk_req_needs_zone_write_lock(struct request *rq);
+void __blk_req_zone_write_lock(struct request *rq);
+void __blk_req_zone_write_unlock(struct request *rq);
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+	if (blk_req_needs_zone_write_lock(rq))
+		__blk_req_zone_write_lock(rq);
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
+		__blk_req_zone_write_unlock(rq);
+}
+
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+	return rq->q->seq_zones_wlock &&
+		test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+	if (!blk_req_needs_zone_write_lock(rq))
+		return true;
+	return !blk_req_zone_is_write_locked(rq);
+}
+#else
+static inline bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+	return false;
+}
+
+static inline void blk_req_zone_write_lock(struct request *rq)
+{
+}
+
+static inline void blk_req_zone_write_unlock(struct request *rq)
+{
+}
+static inline bool blk_req_zone_is_write_locked(struct request *rq)
+{
+	return false;
+}
+
+static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+{
+	return true;
+}
+#endif /* CONFIG_BLK_DEV_ZONED */
+
 #else /* CONFIG_BLOCK */
 
 struct block_device;
-- 
cgit v1.2.3


From 86292abc5af206f64192a0b60da06fd604debdc0 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 18 Dec 2017 20:22:03 +0800
Subject: block: introduce bio helpers for converting to multipage bvec

The following helpers are introduced for converting current users of
direct access to bvec table, and prepares for supporting multipage bvec:

	bio_pages_all()
	bio_first_bvec_all()
	bio_first_page_all()
	bio_last_bvec_all()

All are named as bio_*_all() to following bio_for_each_segment_all(),
they can only be used on bio of !bio_flagged(bio, BIO_CLONED), that means
the whole bvec table is covered.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 82f0c8fd7be8..435ddf04e889 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -300,6 +300,29 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
 		bv->bv_len = iter.bi_bvec_done;
 }
 
+static inline unsigned bio_pages_all(struct bio *bio)
+{
+	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+	return bio->bi_vcnt;
+}
+
+static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
+{
+	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+	return bio->bi_io_vec;
+}
+
+static inline struct page *bio_first_page_all(struct bio *bio)
+{
+	return bio_first_bvec_all(bio)->bv_page;
+}
+
+static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
+{
+	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+	return &bio->bi_io_vec[bio->bi_vcnt - 1];
+}
+
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
 	BIP_MAPPED_INTEGRITY	= 1 << 1, /* ref tag has been remapped */
-- 
cgit v1.2.3


From 3c892a098b0bfa3e571f1f0d2a7e72fbaeea691a Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 18 Dec 2017 20:22:07 +0800
Subject: block: bounce: don't access bio->bi_io_vec in copy_to_high_bio_irq

Firstly this patch introduces BVEC_ITER_ALL_INIT for iterating one bio
from start to end.

As we need to support multipage bvecs, don't access bio->bi_io_vec
in copy_to_high_bio_irq(), and just use the standard iterator for that.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bvec.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index ec8a4d7af6bd..fe7a22dd133b 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -125,4 +125,13 @@ static inline bool bvec_iter_rewind(const struct bio_vec *bv,
 		((bvl = bvec_iter_bvec((bio_vec), (iter))), 1);	\
 	     bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len))
 
+/* for iterating one bio from start to end */
+#define BVEC_ITER_ALL_INIT (struct bvec_iter)				\
+{									\
+	.bi_sector	= 0,						\
+	.bi_size	= UINT_MAX,					\
+	.bi_idx		= 0,						\
+	.bi_bvec_done	= 0,						\
+}
+
 #endif /* __LINUX_BVEC_ITER_H */
-- 
cgit v1.2.3


From 25d8be77e19224d8f21b363d77b5283c5dc21a57 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 18 Dec 2017 20:22:10 +0800
Subject: block: move bio_alloc_pages() to bcache

bcache is the only user of bio_alloc_pages(), so move this function into
bcache, and avoid it being misused in the future.

Also rename it to bch_bio_allo_pages() since it is bcache only.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 435ddf04e889..367a979fd4a6 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -500,7 +500,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 #endif
 
 extern void bio_copy_data(struct bio *dst, struct bio *src);
-extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 extern void bio_free_pages(struct bio *bio);
 
 extern struct bio *bio_copy_user_iov(struct request_queue *,
-- 
cgit v1.2.3


From e80a0af4759a164214f02da157a3800753ce135f Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 5 Jan 2018 08:26:46 -0800
Subject: lib/scatterlist: Introduce sgl_alloc() and sgl_free()

Many kernel drivers contain code that allocates and frees both a
scatterlist and the pages that populate that scatterlist.
Introduce functions in lib/scatterlist.c that perform these tasks
instead of duplicating this functionality in multiple drivers.
Only include these functions in the build if CONFIG_SGL_ALLOC=y
to avoid that the kernel size increases if this functionality is
not used.

Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/scatterlist.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index b7c83254c566..b8a7c1d1dbe3 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -276,6 +276,16 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages,
 			      unsigned int n_pages, unsigned int offset,
 			      unsigned long size, gfp_t gfp_mask);
 
+#ifdef CONFIG_SGL_ALLOC
+struct scatterlist *sgl_alloc_order(unsigned long long length,
+				    unsigned int order, bool chainable,
+				    gfp_t gfp, unsigned int *nent_p);
+struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
+			      unsigned int *nent_p);
+void sgl_free_order(struct scatterlist *sgl, int order);
+void sgl_free(struct scatterlist *sgl);
+#endif /* CONFIG_SGL_ALLOC */
+
 size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
 		      size_t buflen, off_t skip, bool to_buffer);
 
-- 
cgit v1.2.3


From 1d9bd5161ba32db5665a617edc8b0723880f543e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Jan 2018 08:29:48 -0800
Subject: blk-mq: replace timeout synchronization with a RCU and generation
 based scheme

Currently, blk-mq timeout path synchronizes against the usual
issue/completion path using a complex scheme involving atomic
bitflags, REQ_ATOM_*, memory barriers and subtle memory coherence
rules.  Unfortunately, it contains quite a few holes.

There's a complex dancing around REQ_ATOM_STARTED and
REQ_ATOM_COMPLETE between issue/completion and timeout paths; however,
they don't have a synchronization point across request recycle
instances and it isn't clear what the barriers add.
blk_mq_check_expired() can easily read STARTED from N-2'th iteration,
deadline from N-1'th, blk_mark_rq_complete() against Nth instance.

In fact, it's pretty easy to make blk_mq_check_expired() terminate a
later instance of a request.  If we induce 5 sec delay before
time_after_eq() test in blk_mq_check_expired(), shorten the timeout to
2s, and issue back-to-back large IOs, blk-mq starts timing out
requests spuriously pretty quickly.  Nothing actually timed out.  It
just made the call on a recycle instance of a request and then
terminated a later instance long after the original instance finished.
The scenario isn't theoretical either.

This patch replaces the broken synchronization mechanism with a RCU
and generation number based one.

1. Each request has a u64 generation + state value, which can be
   updated only by the request owner.  Whenever a request becomes
   in-flight, the generation number gets bumped up too.  This provides
   the basis for the timeout path to distinguish different recycle
   instances of the request.

   Also, marking a request in-flight and setting its deadline are
   protected with a seqcount so that the timeout path can fetch both
   values coherently.

2. The timeout path fetches the generation, state and deadline.  If
   the verdict is timeout, it records the generation into a dedicated
   request abortion field and does RCU wait.

3. The completion path is also protected by RCU (from the previous
   patch) and checks whether the current generation number and state
   match the abortion field.  If so, it skips completion.

4. The timeout path, after RCU wait, scans requests again and
   terminates the ones whose generation and state still match the ones
   requested for abortion.

   By now, the timeout path knows that either the generation number
   and state changed if it lost the race or the completion will yield
   to it and can safely timeout the request.

While it's more lines of code, it's conceptually simpler, doesn't
depend on direct use of subtle memory ordering or coherence, and
hopefully doesn't terminate the wrong instance.

While this change makes REQ_ATOM_COMPLETE synchronization unnecessary
between issue/complete and timeout paths, REQ_ATOM_COMPLETE isn't
removed yet as it's still used in other places.  Future patches will
move all state tracking to the new mechanism and remove all bitops in
the hot paths.

Note that this patch adds a comment explaining a race condition in
BLK_EH_RESET_TIMER path.  The race has always been there and this
patch doesn't change it.  It's just documenting the existing race.

v2: - Fixed BLK_EH_RESET_TIMER handling as pointed out by Jianchao.
    - s/request->gstate_seqc/request->gstate_seq/ as suggested by Peter.
    - READ_ONCE() added in blk_mq_rq_update_state() as suggested by Peter.

v3: - Fixed possible extended seqcount / u64_stats_sync read looping
      spotted by Peter.
    - MQ_RQ_IDLE was incorrectly being set in complete_request instead
      of free_request.  Fixed.

v4: - Rebased on top of hctx_lock() refactoring patch.
    - Added comment explaining the use of hctx_lock() in completion path.

v5: - Added comments requested by Bart.
    - Note the addition of BLK_EH_RESET_TIMER race condition in the
      commit message.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <Bart.VanAssche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h |  1 +
 include/linux/blkdev.h | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 95c9a5c862e2..460798dbac1f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -51,6 +51,7 @@ struct blk_mq_hw_ctx {
 	unsigned int		queue_num;
 
 	atomic_t		nr_active;
+	unsigned int		nr_expired;
 
 	struct hlist_node	cpuhp_dead;
 	struct kobject		kobj;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 46e606f5b44b..ae563d01b29d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,6 +27,8 @@
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
+#include <linux/seqlock.h>
+#include <linux/u64_stats_sync.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -230,6 +232,27 @@ struct request {
 
 	unsigned short write_hint;
 
+	/*
+	 * On blk-mq, the lower bits of ->gstate (generation number and
+	 * state) carry the MQ_RQ_* state value and the upper bits the
+	 * generation number which is monotonically incremented and used to
+	 * distinguish the reuse instances.
+	 *
+	 * ->gstate_seq allows updates to ->gstate and other fields
+	 * (currently ->deadline) during request start to be read
+	 * atomically from the timeout path, so that it can operate on a
+	 * coherent set of information.
+	 */
+	seqcount_t gstate_seq;
+	u64 gstate;
+
+	/*
+	 * ->aborted_gstate is used by the timeout to claim a specific
+	 * recycle instance of this request.  See blk_mq_timeout_work().
+	 */
+	struct u64_stats_sync aborted_gstate_sync;
+	u64 aborted_gstate;
+
 	unsigned long deadline;
 	struct list_head timeout_list;
 
-- 
cgit v1.2.3


From 634f9e4631a88025d3b90c1884e9a1b6a13d01d2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Jan 2018 08:29:51 -0800
Subject: blk-mq: remove REQ_ATOM_COMPLETE usages from blk-mq

After the recent updates to use generation number and state based
synchronization, blk-mq no longer depends on REQ_ATOM_COMPLETE except
to avoid firing the same timeout multiple times.

Remove all REQ_ATOM_COMPLETE usages and use a new rq_flags flag
RQF_MQ_TIMEOUT_EXPIRED to avoid firing the same timeout multiple
times.  This removes atomic bitops from hot paths too.

v2: Removed blk_clear_rq_complete() from blk_mq_rq_timed_out().

v3: Added RQF_MQ_TIMEOUT_EXPIRED flag.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ae563d01b29d..007a7cf1f262 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -125,6 +125,8 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 /* The per-zone write lock is held for this request */
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
+/* timeout is expired */
+#define RQF_MQ_TIMEOUT_EXPIRED	((__force req_flags_t)(1 << 20))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
-- 
cgit v1.2.3


From 05707b64aed8f5f1674b25334fb720d651459d5e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 9 Jan 2018 08:29:53 -0800
Subject: blk-mq: rename blk_mq_hw_ctx->queue_rq_srcu to ->srcu

The RCU protection has been expanded to cover both queueing and
completion paths making ->queue_rq_srcu a misnomer.  Rename it to
->srcu as suggested by Bart.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Bart Van Assche <Bart.VanAssche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 460798dbac1f..8efcf49796a3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -66,7 +66,7 @@ struct blk_mq_hw_ctx {
 #endif
 
 	/* Must be the last member - see also blk_mq_hw_ctx_size(). */
-	struct srcu_struct	queue_rq_srcu[0];
+	struct srcu_struct	srcu[0];
 };
 
 struct blk_mq_tag_set {
-- 
cgit v1.2.3


From 9111e5686c8cf3905191d4feb819acd874437500 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 9 Jan 2018 12:04:16 -0700
Subject: block: Provide blk_status_t decoding for path errors

This patch provides a common decoder for block status path related errors
that may be retried so various entities wishing to consult this do not
have to duplicate this decision.

Acked-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a1e628e032da..2d973ac54b09 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -39,6 +39,34 @@ typedef u8 __bitwise blk_status_t;
 
 #define BLK_STS_AGAIN		((__force blk_status_t)12)
 
+/**
+ * blk_path_error - returns true if error may be path related
+ * @error: status the request was completed with
+ *
+ * Description:
+ *     This classifies block error status into non-retryable errors and ones
+ *     that may be successful if retried on a failover path.
+ *
+ * Return:
+ *     %false - retrying failover path will not help
+ *     %true  - may succeed if retried
+ */
+static inline bool blk_path_error(blk_status_t error)
+{
+	switch (error) {
+	case BLK_STS_NOTSUPP:
+	case BLK_STS_NOSPC:
+	case BLK_STS_TARGET:
+	case BLK_STS_NEXUS:
+	case BLK_STS_MEDIUM:
+	case BLK_STS_PROTECTION:
+		return false;
+	}
+
+	/* Anything else could be a path failure, so should be retried */
+	return true;
+}
+
 struct blk_issue_stat {
 	u64 stat;
 };
-- 
cgit v1.2.3


From 76a86f9d027b342b8759a4b2f9f7fe046e284220 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Jan 2018 11:30:56 -0700
Subject: block: remove REQ_ATOM_POLL_SLEPT

We don't need this to be an atomic flag, it can be a regular
flag. We either end up on the same CPU for the polling, in which
case the state is sane, or we did the sleep which would imply
the needed barrier to ensure we see the right state.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 007a7cf1f262..ba31674d8581 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -127,6 +127,8 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
 /* timeout is expired */
 #define RQF_MQ_TIMEOUT_EXPIRED	((__force req_flags_t)(1 << 20))
+/* already slept for hybrid poll */
+#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 21))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
-- 
cgit v1.2.3


From 0a72e7f44964b9ada3e5c15820372e9cb119bf80 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 9 Jan 2018 14:23:42 -0700
Subject: block: add accessors for setting/querying request deadline

We reduce the resolution of request expiry, but since we're already
using jiffies for this where resolution depends on the kernel
configuration and since the timeout resolution is coarse anyway,
that should be fine.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ba31674d8581..aa6698cf483c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -257,7 +257,9 @@ struct request {
 	struct u64_stats_sync aborted_gstate_sync;
 	u64 aborted_gstate;
 
-	unsigned long deadline;
+	/* access through blk_rq_set_deadline, blk_rq_deadline */
+	unsigned long __deadline;
+
 	struct list_head timeout_list;
 
 	/*
-- 
cgit v1.2.3


From e14575b3d457f5806d79b85886ef94d9c29e3b2a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Jan 2018 11:34:25 -0700
Subject: block: convert REQ_ATOM_COMPLETE to stealing rq->__deadline bit

We only have one atomic flag left. Instead of using an entire
unsigned long for that, steal the bottom bit of the deadline
field that we already reserved.

Remove ->atomic_flags, since it's now unused.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aa6698cf483c..d4b2f7bb18d6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -156,8 +156,6 @@ struct request {
 
 	int internal_tag;
 
-	unsigned long atomic_flags;
-
 	/* the following two fields are internal, NEVER access directly */
 	unsigned int __data_len;	/* total data len */
 	int tag;
-- 
cgit v1.2.3


From 7c3fb70f0341f9d924818e648906774921f4bcb3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Jan 2018 11:46:39 -0700
Subject: block: rearrange a few request fields for better cache layout

Move completion related items (like the call single data) near the
end of the struct, instead of mixing them in with the initial
queueing related fields.

Move queuelist below the bio structures. Then we have all
queueing related bits in the first cache line.

This yields a 1.5-2% increase in IOPS for a null_blk test, both for
sync and for high thread count access. Sync test goes form 975K to
992K, 32-thread case from 20.8M to 21.2M IOPS.

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d4b2f7bb18d6..71a9371c8182 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -141,12 +141,6 @@ typedef __u32 __bitwise req_flags_t;
  * especially blk_mq_rq_ctx_init() to take care of the added fields.
  */
 struct request {
-	struct list_head queuelist;
-	union {
-		call_single_data_t csd;
-		u64 fifo_time;
-	};
-
 	struct request_queue *q;
 	struct blk_mq_ctx *mq_ctx;
 
@@ -164,6 +158,8 @@ struct request {
 	struct bio *bio;
 	struct bio *biotail;
 
+	struct list_head queuelist;
+
 	/*
 	 * The hash is used inside the scheduler, and killed once the
 	 * request reaches the dispatch list. The ipi_list is only used
@@ -211,19 +207,16 @@ struct request {
 	struct hd_struct *part;
 	unsigned long start_time;
 	struct blk_issue_stat issue_stat;
-#ifdef CONFIG_BLK_CGROUP
-	struct request_list *rl;		/* rl this rq is alloced from */
-	unsigned long long start_time_ns;
-	unsigned long long io_start_time_ns;    /* when passed to hardware */
-#endif
 	/* Number of scatter-gather DMA addr+len pairs after
 	 * physical address coalescing is performed.
 	 */
 	unsigned short nr_phys_segments;
+
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	unsigned short nr_integrity_segments;
 #endif
 
+	unsigned short write_hint;
 	unsigned short ioprio;
 
 	unsigned int timeout;
@@ -232,8 +225,6 @@ struct request {
 
 	unsigned int extra_len;	/* length of alignment and padding */
 
-	unsigned short write_hint;
-
 	/*
 	 * On blk-mq, the lower bits of ->gstate (generation number and
 	 * state) carry the MQ_RQ_* state value and the upper bits the
@@ -260,6 +251,11 @@ struct request {
 
 	struct list_head timeout_list;
 
+	union {
+		call_single_data_t csd;
+		u64 fifo_time;
+	};
+
 	/*
 	 * completion callback.
 	 */
@@ -268,6 +264,12 @@ struct request {
 
 	/* for bidi */
 	struct request *next_rq;
+
+#ifdef CONFIG_BLK_CGROUP
+	struct request_list *rl;		/* rl this rq is alloced from */
+	unsigned long long start_time_ns;
+	unsigned long long io_start_time_ns;    /* when passed to hardware */
+#endif
 };
 
 static inline bool blk_rq_is_scsi(struct request *rq)
-- 
cgit v1.2.3


From fa70d2e2c4a0a54ced98260c6a176cc94c876d27 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 8 Jan 2018 22:01:13 -0500
Subject: block: allow gendisk's request_queue registration to be deferred

Since I can remember DM has forced the block layer to allow the
allocation and initialization of the request_queue to be distinct
operations.  Reason for this is block/genhd.c:add_disk() has requires
that the request_queue (and associated bdi) be tied to the gendisk
before add_disk() is called -- because add_disk() also deals with
exposing the request_queue via blk_register_queue().

DM's dynamic creation of arbitrary device types (and associated
request_queue types) requires the DM device's gendisk be available so
that DM table loads can establish a master/slave relationship with
subordinate devices that are referenced by loaded DM tables -- using
bd_link_disk_holder().  But until these DM tables, and their associated
subordinate devices, are known DM cannot know what type of request_queue
it needs -- nor what its queue_limits should be.

This chicken and egg scenario has created all manner of problems for DM
and, at times, the block layer.

Summary of changes:

- Add device_add_disk_no_queue_reg() and add_disk_no_queue_reg() variant
  that drivers may use to add a disk without also calling
  blk_register_queue().  Driver must call blk_register_queue() once its
  request_queue is fully initialized.

- Return early from blk_unregister_queue() if QUEUE_FLAG_REGISTERED
  is not set.  It won't be set if driver used add_disk_no_queue_reg()
  but driver encounters an error and must del_gendisk() before calling
  blk_register_queue().

- Export blk_register_queue().

These changes allow DM to use add_disk_no_queue_reg() to anchor its
gendisk as the "master" for master/slave relationships DM must establish
with subordinate devices referenced in DM tables that get loaded.  Once
all "slave" devices for a DM device are known its request_queue can be
properly initialized and then advertised via sysfs -- important
improvement being that no request_queue resource initialization
performed by blk_register_queue() is missed for DM devices anymore.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/genhd.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5144ebe046c9..5e3531027b51 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk)
 {
 	device_add_disk(NULL, disk);
 }
+extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk);
+static inline void add_disk_no_queue_reg(struct gendisk *disk)
+{
+	device_add_disk_no_queue_reg(NULL, disk);
+}
 
 extern void del_gendisk(struct gendisk *gp);
 extern struct gendisk *get_gendisk(dev_t dev, int *partno);
-- 
cgit v1.2.3


From ddc212313f16cd65fcf5e8d9ae223f8374822e4d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 16 Jan 2018 16:01:36 +0100
Subject: blkcg: simplify statistic accumulation code

Some older compilers (gcc-4.4 through 4.6 in particular) struggle
with the way that blkg_rwstat_read() returns a structure, leading
to excessive stack usage and rather inefficient code:

block/blk-cgroup.c: In function 'blkg_destroy':
block/blk-cgroup.c:354:1: error: the frame size of 1296 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]
block/cfq-iosched.c: In function 'cfqg_stats_add_aux':
block/cfq-iosched.c:753:1: error: the frame size of 1928 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]
block/bfq-cgroup.c: In function 'bfqg_stats_add_aux':
block/bfq-cgroup.c:299:1: error: the frame size of 1928 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]

I also notice that there is no point in using atomic accesses
for the local variables, so storing the temporaries in simple 'u64'
variables not only avoids the stack usage on older compilers but
also improves the object code on modern versions.

Fixes: e6269c445467 ("blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it")
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index e9825ff57b15..69bea82ebeb1 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
 				       struct blkg_rwstat *from)
 {
-	struct blkg_rwstat v = blkg_rwstat_read(from);
+	u64 sum[BLKG_RWSTAT_NR];
 	int i;
 
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		atomic64_add(atomic64_read(&v.aux_cnt[i]) +
-			     atomic64_read(&from->aux_cnt[i]),
+		sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
+
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
 			     &to->aux_cnt[i]);
 }
 
-- 
cgit v1.2.3


From 88de4598bca84e27b261685c06fff816b8d932a1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 20 Dec 2017 14:50:00 +0100
Subject: nvme-pci: clean up SMBSZ bit definitions

Define the bit positions instead of macros using the magic values,
and move the expanded helpers to calculate the size and size unit into
the implementation C file.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
---
 include/linux/nvme.h | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index aea87f0d917b..4112e2bd747f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -124,14 +124,20 @@ enum {
 
 #define NVME_CMB_BIR(cmbloc)	((cmbloc) & 0x7)
 #define NVME_CMB_OFST(cmbloc)	(((cmbloc) >> 12) & 0xfffff)
-#define NVME_CMB_SZ(cmbsz)	(((cmbsz) >> 12) & 0xfffff)
-#define NVME_CMB_SZU(cmbsz)	(((cmbsz) >> 8) & 0xf)
-
-#define NVME_CMB_WDS(cmbsz)	((cmbsz) & 0x10)
-#define NVME_CMB_RDS(cmbsz)	((cmbsz) & 0x8)
-#define NVME_CMB_LISTS(cmbsz)	((cmbsz) & 0x4)
-#define NVME_CMB_CQS(cmbsz)	((cmbsz) & 0x2)
-#define NVME_CMB_SQS(cmbsz)	((cmbsz) & 0x1)
+
+enum {
+	NVME_CMBSZ_SQS		= 1 << 0,
+	NVME_CMBSZ_CQS		= 1 << 1,
+	NVME_CMBSZ_LISTS	= 1 << 2,
+	NVME_CMBSZ_RDS		= 1 << 3,
+	NVME_CMBSZ_WDS		= 1 << 4,
+
+	NVME_CMBSZ_SZ_SHIFT	= 12,
+	NVME_CMBSZ_SZ_MASK	= 0xfffff,
+
+	NVME_CMBSZ_SZU_SHIFT	= 8,
+	NVME_CMBSZ_SZU_MASK	= 0xf,
+};
 
 /*
  * Submission and Completion Queue Entry Sizes for the NVM command set.
-- 
cgit v1.2.3


From 83d016ac86428dbca8a62d3e4fdc29e3ea39e535 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Wed, 17 Jan 2018 11:48:08 -0800
Subject: block: Unexport elv_register_queue() and elv_unregister_queue()

These two functions are only called from inside the block layer so
unexport them.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/elevator.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 3d794b3dc532..6d9e230dffd2 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
-extern int elv_register_queue(struct request_queue *q);
-extern void elv_unregister_queue(struct request_queue *q);
 extern int elv_may_queue(struct request_queue *, unsigned int);
 extern void elv_completed_request(struct request_queue *, struct request *);
 extern int elv_set_request(struct request_queue *q, struct request *rq,
-- 
cgit v1.2.3


From 8c7a8d1c4b9c30a2be3b31a2e6af1cefd45574eb Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 19 Jan 2018 11:00:54 -0800
Subject: lib/scatterlist: Fix chaining support in sgl_alloc_order()

This patch avoids that workloads with large block sizes (megabytes)
can trigger the following call stack with the ib_srpt driver (that
driver is the only driver that chains scatterlists allocated by
sgl_alloc_order()):

BUG: Bad page state in process kworker/0:1H  pfn:2423a78
page:fffffb03d08e9e00 count:-3 mapcount:0 mapping:          (null) index:0x0
flags: 0x57ffffc0000000()
raw: 0057ffffc0000000 0000000000000000 0000000000000000 fffffffdffffffff
raw: dead000000000100 dead000000000200 0000000000000000 0000000000000000
page dumped because: nonzero _count
CPU: 0 PID: 733 Comm: kworker/0:1H Tainted: G          I      4.15.0-rc7.bart+ #1
Hardware name: HP ProLiant DL380 G7, BIOS P67 08/16/2015
Workqueue: ib-comp-wq ib_cq_poll_work [ib_core]
Call Trace:
 dump_stack+0x5c/0x83
 bad_page+0xf5/0x10f
 get_page_from_freelist+0xa46/0x11b0
 __alloc_pages_nodemask+0x103/0x290
 sgl_alloc_order+0x101/0x180
 target_alloc_sgl+0x2c/0x40 [target_core_mod]
 srpt_alloc_rw_ctxs+0x173/0x2d0 [ib_srpt]
 srpt_handle_new_iu+0x61e/0x7f0 [ib_srpt]
 __ib_process_cq+0x55/0xa0 [ib_core]
 ib_cq_poll_work+0x1b/0x60 [ib_core]
 process_one_work+0x141/0x340
 worker_thread+0x47/0x3e0
 kthread+0xf5/0x130
 ret_from_fork+0x1f/0x30

Fixes: e80a0af4759a ("lib/scatterlist: Introduce sgl_alloc() and sgl_free()")
Reported-by: Laurence Oberman <loberman@redhat.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Nicholas A. Bellinger <nab@linux-iscsi.org>
Cc: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/scatterlist.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index b8a7c1d1dbe3..22b2131bcdcd 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -282,6 +282,7 @@ struct scatterlist *sgl_alloc_order(unsigned long long length,
 				    gfp_t gfp, unsigned int *nent_p);
 struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
 			      unsigned int *nent_p);
+void sgl_free_n_order(struct scatterlist *sgl, int nents, int order);
 void sgl_free_order(struct scatterlist *sgl, int order);
 void sgl_free(struct scatterlist *sgl);
 #endif /* CONFIG_SGL_ALLOC */
-- 
cgit v1.2.3


From f5ced52aaa5494c1feb9f80252cb2a2cde0dace8 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@wdc.com>
Date: Fri, 19 Jan 2018 08:58:56 -0800
Subject: block: Remove kblockd_schedule_delayed_work{,_on}()

The previous patch removed all users of these two functions. Hence
also remove the functions themselves.

Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 71a9371c8182..afc43fb63c16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1800,8 +1800,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
 
 int kblockd_schedule_work(struct work_struct *work);
 int kblockd_schedule_work_on(int cpu, struct work_struct *work);
-int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
-int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
-- 
cgit v1.2.3