From 26f76dce60d28028e5c1fbbc39e771366a27671f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 5 Jan 2018 14:15:59 +0100 Subject: lightnvm: use internal pblk methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that rrpc has been removed, the only users of the ppa helpers is pblk. However, pblk already defines similar functions. Switch pblk to use the internal ones, and remove the generic ppa helpers. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2d1d9de06728..14e274b7d094 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -418,25 +418,6 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_tgt_dev *tgt_dev, return l; } -static inline int ppa_empty(struct ppa_addr ppa_addr) -{ - return (ppa_addr.ppa == ADDR_EMPTY); -} - -static inline void ppa_set_empty(struct ppa_addr *ppa_addr) -{ - ppa_addr->ppa = ADDR_EMPTY; -} - -static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) -{ - if (ppa_empty(ppa1) || ppa_empty(ppa2)) - return 0; - - return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) && - (ppa1.g.blk == ppa2.g.blk)); -} - typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, -- cgit v1.2.3 From e3e13bcc14717800e3e3239ca3faac24f2f04575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 5 Jan 2018 14:16:00 +0100 Subject: lightnvm: remove hybrid ocssd 1.2 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that rrpc have been removed. Also remove the hybrid 1.2 support from the core. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 14e274b7d094..97ceb841e9a0 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -50,10 +50,7 @@ struct nvm_id; struct nvm_dev; struct nvm_tgt_dev; -typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, - nvm_l2p_update_fn *, void *); typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); @@ -66,7 +63,6 @@ typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); struct nvm_dev_ops { nvm_id_fn *identity; - nvm_get_l2p_tbl_fn *get_l2p_tbl; nvm_op_bb_tbl_fn *get_bb_tbl; nvm_op_set_bb_fn *set_bb_tbl; @@ -112,8 +108,6 @@ enum { NVM_RSP_WARN_HIGHECC = 0x4700, /* Device opcodes */ - NVM_OP_HBREAD = 0x02, - NVM_OP_HBWRITE = 0x81, NVM_OP_PWRITE = 0x91, NVM_OP_PREAD = 0x92, NVM_OP_ERASE = 0x90, @@ -346,36 +340,6 @@ struct nvm_dev { struct list_head targets; }; -static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, - u64 pba) -{ - struct ppa_addr l; - int secs, pgs, blks, luns; - sector_t ppa = pba; - - l.ppa = 0; - - div_u64_rem(ppa, geo->sec_per_pg, &secs); - l.g.sec = secs; - - sector_div(ppa, geo->sec_per_pg); - div_u64_rem(ppa, geo->pgs_per_blk, &pgs); - l.g.pg = pgs; - - sector_div(ppa, geo->pgs_per_blk); - div_u64_rem(ppa, geo->blks_per_lun, &blks); - l.g.blk = blks; - - sector_div(ppa, geo->blks_per_lun); - div_u64_rem(ppa, geo->luns_per_chnl, &luns); - l.g.lun = luns; - - sector_div(ppa, geo->luns_per_chnl); - l.g.ch = ppa; - - return l; -} - static inline struct ppa_addr generic_to_dev_addr(struct nvm_tgt_dev *tgt_dev, struct ppa_addr r) { @@ -462,17 +426,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); -extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); -extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, - void *); -extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); -extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); - #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From 98281a90acc04d8a10407dabd2e397e4312b80c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 5 Jan 2018 14:16:01 +0100 Subject: lightnvm: remove unnecessary field from nvm_rq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the wait filed in nvm_rq. It is not used anymore, as targets rely on the functionality provided by the LightNVM subsystem when sending sync I/O. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 97ceb841e9a0..07cdb05a9a87 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -233,7 +233,6 @@ struct nvm_rq { void *meta_list; dma_addr_t dma_meta_list; - struct completion *wait; nvm_end_io_fn *end_io; uint8_t opcode; -- cgit v1.2.3 From bb27aa9ecd1f72e68b0fa2dffeb45bee3b1cb5ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 5 Jan 2018 14:16:02 +0100 Subject: lightnvm: remove lower page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lower page table is unused. All page tables reported by 1.2 devices are all reporting a sequential 1:1 page mapping. This is also not used going forward with the 2.0 revision. Signed-off-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 07cdb05a9a87..a5d8e0cbbb46 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -174,8 +174,6 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; - - struct nvm_id_lp_tbl lptbl; }; struct nvm_addr_format { @@ -313,10 +311,6 @@ struct nvm_dev { /* Device information */ struct nvm_geo geo; - /* lower page table */ - int lps_per_blk; - int *lptbl; - unsigned long total_secs; unsigned long *lun_map; -- cgit v1.2.3 From fae7fae4077c24dc2be720b9f21f53adea98d7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 5 Jan 2018 14:16:03 +0100 Subject: lightnvm: make geometry structures 2.0 ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare for the 2.0 revision by adapting the geometry structures to coexist with the 1.2 revision. Signed-off-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 52 ++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index a5d8e0cbbb46..8e43bfebd38d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -159,12 +159,16 @@ struct nvm_id_group { u8 fmtype; u8 num_ch; u8 num_lun; - u8 num_pln; - u16 num_blk; - u16 num_pg; - u16 fpg_sz; + u16 num_chk; + u16 clba; u16 csecs; u16 sos; + + u16 ws_min; + u16 ws_opt; + u16 ws_seq; + u16 ws_per_chk; + u32 trdt; u32 trdm; u32 tprt; @@ -174,6 +178,11 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; + + /* 1.2 compatibility */ + u8 num_pln; + u16 num_pg; + u16 fpg_sz; }; struct nvm_addr_format { @@ -259,31 +268,36 @@ enum { NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; + /* Device generic information */ struct nvm_geo { + /* generic geometry */ int nr_chnls; - int nr_luns; - int luns_per_chnl; /* -1 if channels are not symmetric */ - int nr_planes; - int sec_per_pg; /* only sectors for a single page */ - int pgs_per_blk; - int blks_per_lun; - int fpg_size; - int pfpg_size; /* size of buffer if all pages are to be read */ + int all_luns; /* across channels */ + int nr_luns; /* per channel */ + int nr_chks; /* per lun */ + int sec_size; int oob_size; int mccap; - struct nvm_addr_format ppaf; - /* Calculated/Cached values. These do not reflect the actual usable - * blocks at run-time. - */ + int sec_per_chk; + int sec_per_lun; + + int ws_min; + int ws_opt; + int ws_seq; + int ws_per_chk; + int max_rq_size; - int plane_mode; /* drive device in single, double or quad mode */ + struct nvm_addr_format ppaf; + + /* Legacy 1.2 specific geometry */ + int plane_mode; /* drive device in single, double or quad mode */ + int nr_planes; + int sec_per_pg; /* only sectors for a single page */ int sec_per_pl; /* all sectors across planes */ - int sec_per_blk; - int sec_per_lun; }; /* sub-device structure */ -- cgit v1.2.3 From e53927393b9987b7c986b6364c27111077f0ea3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 5 Jan 2018 14:16:14 +0100 Subject: lightnvm: set target over-provision on create ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow to set the over-provision percentage on target creation. In case that the value is not provided, fall back to the default value set by the target. In pblk, set the default OP to 11% of the total size of the device Signed-off-by: Javier González Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 8e43bfebd38d..7f4b60abdf27 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -218,6 +218,10 @@ struct nvm_target { #define ADDR_EMPTY (~0ULL) +#define NVM_TARGET_DEFAULT_OP (101) +#define NVM_TARGET_MIN_OP (3) +#define NVM_TARGET_MAX_OP (80) + #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 @@ -291,6 +295,8 @@ struct nvm_geo { int max_rq_size; + int op; + struct nvm_addr_format ppaf; /* Legacy 1.2 specific geometry */ -- cgit v1.2.3 From 6cc77e9cb08041627fe1d32ac3a743249deb8167 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Dec 2017 15:43:38 +0900 Subject: block: introduce zoned block devices zone write locking Components relying only on the request_queue structure for accessing block devices (e.g. I/O schedulers) have a limited knowledged of the device characteristics. In particular, the device capacity cannot be easily discovered, which for a zoned block device also result in the inability to easily know the number of zones of the device (the zone size is indicated by the chunk_sectors field of the queue limits). Introduce the nr_zones field to the request_queue structure to simplify access to this information. Also, add the bitmap seq_zone_bitmap which indicates which zones of the device are sequential zones (write preferred or write required) and the bitmap seq_zones_wlock which indicates if a zone is write locked, that is, if a write request targeting a zone was dispatched to the device. These fields are initialized by the low level block device driver (sd.c for ZBC/ZAC disks). They are not initialized by stacking drivers (device mappers) handling zoned block devices (e.g. dm-linear). Using this, I/O schedulers can introduce zone write locking to control request dispatching to a zoned block device and avoid write request reordering by limiting to at most a single write request per zone outside of the scheduler at any time. Based on previous patches from Damien Le Moal. Signed-off-by: Christoph Hellwig [Damien] * Fixed comments and identation in blkdev.h * Changed helper functions * Fixed this commit message Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 111 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8089ca17db9a..46e606f5b44b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t; /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) +/* The per-zone write lock is held for this request */ +#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -546,6 +548,22 @@ struct request_queue { struct queue_limits limits; + /* + * Zoned block device information for request dispatch control. + * nr_zones is the total number of zones of the device. This is always + * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones + * bits which indicates if a zone is conventional (bit clear) or + * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones + * bits which indicates if a zone is write locked, that is, if a write + * request targeting the zone was dispatched. All three fields are + * initialized by the low level device driver (e.g. scsi/sd.c). + * Stacking drivers (device mappers) may or may not initialize + * these fields. + */ + unsigned int nr_zones; + unsigned long *seq_zones_bitmap; + unsigned long *seq_zones_wlock; + /* * sg stuff */ @@ -790,6 +808,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } +static inline unsigned int blk_queue_nr_zones(struct request_queue *q) +{ + return q->nr_zones; +} + +static inline unsigned int blk_queue_zone_no(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q)) + return 0; + return sector >> ilog2(q->limits.chunk_sectors); +} + +static inline bool blk_queue_zone_is_seq(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap) + return false; + return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); +} + static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); @@ -1029,6 +1068,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +static inline unsigned int blk_rq_zone_no(struct request *rq) +{ + return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); +} + +static inline unsigned int blk_rq_zone_is_seq(struct request *rq) +{ + return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); +} + /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. Any driver that supports such @@ -1578,7 +1627,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) if (q) return blk_queue_zone_sectors(q); + return 0; +} +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_nr_zones(q); return 0; } @@ -1954,6 +2011,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); + +#ifdef CONFIG_BLK_DEV_ZONED +bool blk_req_needs_zone_write_lock(struct request *rq); +void __blk_req_zone_write_lock(struct request *rq); +void __blk_req_zone_write_unlock(struct request *rq); + +static inline void blk_req_zone_write_lock(struct request *rq) +{ + if (blk_req_needs_zone_write_lock(rq)) + __blk_req_zone_write_lock(rq); +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) + __blk_req_zone_write_unlock(rq); +} + +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return rq->q->seq_zones_wlock && + test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + if (!blk_req_needs_zone_write_lock(rq)) + return true; + return !blk_req_zone_is_write_locked(rq); +} +#else +static inline bool blk_req_needs_zone_write_lock(struct request *rq) +{ + return false; +} + +static inline void blk_req_zone_write_lock(struct request *rq) +{ +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ +} +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return false; +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + return true; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + #else /* CONFIG_BLOCK */ struct block_device; -- cgit v1.2.3 From 86292abc5af206f64192a0b60da06fd604debdc0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:03 +0800 Subject: block: introduce bio helpers for converting to multipage bvec The following helpers are introduced for converting current users of direct access to bvec table, and prepares for supporting multipage bvec: bio_pages_all() bio_first_bvec_all() bio_first_page_all() bio_last_bvec_all() All are named as bio_*_all() to following bio_for_each_segment_all(), they can only be used on bio of !bio_flagged(bio, BIO_CLONED), that means the whole bvec table is covered. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 82f0c8fd7be8..435ddf04e889 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -300,6 +300,29 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) bv->bv_len = iter.bi_bvec_done; } +static inline unsigned bio_pages_all(struct bio *bio) +{ + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + return bio->bi_vcnt; +} + +static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) +{ + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + return bio->bi_io_vec; +} + +static inline struct page *bio_first_page_all(struct bio *bio) +{ + return bio_first_bvec_all(bio)->bv_page; +} + +static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) +{ + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + return &bio->bi_io_vec[bio->bi_vcnt - 1]; +} + enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ -- cgit v1.2.3 From 3c892a098b0bfa3e571f1f0d2a7e72fbaeea691a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:07 +0800 Subject: block: bounce: don't access bio->bi_io_vec in copy_to_high_bio_irq Firstly this patch introduces BVEC_ITER_ALL_INIT for iterating one bio from start to end. As we need to support multipage bvecs, don't access bio->bi_io_vec in copy_to_high_bio_irq(), and just use the standard iterator for that. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ec8a4d7af6bd..fe7a22dd133b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -125,4 +125,13 @@ static inline bool bvec_iter_rewind(const struct bio_vec *bv, ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) +/* for iterating one bio from start to end */ +#define BVEC_ITER_ALL_INIT (struct bvec_iter) \ +{ \ + .bi_sector = 0, \ + .bi_size = UINT_MAX, \ + .bi_idx = 0, \ + .bi_bvec_done = 0, \ +} + #endif /* __LINUX_BVEC_ITER_H */ -- cgit v1.2.3 From 25d8be77e19224d8f21b363d77b5283c5dc21a57 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:10 +0800 Subject: block: move bio_alloc_pages() to bcache bcache is the only user of bio_alloc_pages(), so move this function into bcache, and avoid it being misused in the future. Also rename it to bch_bio_allo_pages() since it is bcache only. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 435ddf04e889..367a979fd4a6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -500,7 +500,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); -extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, -- cgit v1.2.3 From e80a0af4759a164214f02da157a3800753ce135f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 5 Jan 2018 08:26:46 -0800 Subject: lib/scatterlist: Introduce sgl_alloc() and sgl_free() Many kernel drivers contain code that allocates and frees both a scatterlist and the pages that populate that scatterlist. Introduce functions in lib/scatterlist.c that perform these tasks instead of duplicating this functionality in multiple drivers. Only include these functions in the build if CONFIG_SGL_ALLOC=y to avoid that the kernel size increases if this functionality is not used. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/scatterlist.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index b7c83254c566..b8a7c1d1dbe3 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -276,6 +276,16 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, gfp_t gfp_mask); +#ifdef CONFIG_SGL_ALLOC +struct scatterlist *sgl_alloc_order(unsigned long long length, + unsigned int order, bool chainable, + gfp_t gfp, unsigned int *nent_p); +struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, + unsigned int *nent_p); +void sgl_free_order(struct scatterlist *sgl, int order); +void sgl_free(struct scatterlist *sgl); +#endif /* CONFIG_SGL_ALLOC */ + size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); -- cgit v1.2.3 From 1d9bd5161ba32db5665a617edc8b0723880f543e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:48 -0800 Subject: blk-mq: replace timeout synchronization with a RCU and generation based scheme Currently, blk-mq timeout path synchronizes against the usual issue/completion path using a complex scheme involving atomic bitflags, REQ_ATOM_*, memory barriers and subtle memory coherence rules. Unfortunately, it contains quite a few holes. There's a complex dancing around REQ_ATOM_STARTED and REQ_ATOM_COMPLETE between issue/completion and timeout paths; however, they don't have a synchronization point across request recycle instances and it isn't clear what the barriers add. blk_mq_check_expired() can easily read STARTED from N-2'th iteration, deadline from N-1'th, blk_mark_rq_complete() against Nth instance. In fact, it's pretty easy to make blk_mq_check_expired() terminate a later instance of a request. If we induce 5 sec delay before time_after_eq() test in blk_mq_check_expired(), shorten the timeout to 2s, and issue back-to-back large IOs, blk-mq starts timing out requests spuriously pretty quickly. Nothing actually timed out. It just made the call on a recycle instance of a request and then terminated a later instance long after the original instance finished. The scenario isn't theoretical either. This patch replaces the broken synchronization mechanism with a RCU and generation number based one. 1. Each request has a u64 generation + state value, which can be updated only by the request owner. Whenever a request becomes in-flight, the generation number gets bumped up too. This provides the basis for the timeout path to distinguish different recycle instances of the request. Also, marking a request in-flight and setting its deadline are protected with a seqcount so that the timeout path can fetch both values coherently. 2. The timeout path fetches the generation, state and deadline. If the verdict is timeout, it records the generation into a dedicated request abortion field and does RCU wait. 3. The completion path is also protected by RCU (from the previous patch) and checks whether the current generation number and state match the abortion field. If so, it skips completion. 4. The timeout path, after RCU wait, scans requests again and terminates the ones whose generation and state still match the ones requested for abortion. By now, the timeout path knows that either the generation number and state changed if it lost the race or the completion will yield to it and can safely timeout the request. While it's more lines of code, it's conceptually simpler, doesn't depend on direct use of subtle memory ordering or coherence, and hopefully doesn't terminate the wrong instance. While this change makes REQ_ATOM_COMPLETE synchronization unnecessary between issue/complete and timeout paths, REQ_ATOM_COMPLETE isn't removed yet as it's still used in other places. Future patches will move all state tracking to the new mechanism and remove all bitops in the hot paths. Note that this patch adds a comment explaining a race condition in BLK_EH_RESET_TIMER path. The race has always been there and this patch doesn't change it. It's just documenting the existing race. v2: - Fixed BLK_EH_RESET_TIMER handling as pointed out by Jianchao. - s/request->gstate_seqc/request->gstate_seq/ as suggested by Peter. - READ_ONCE() added in blk_mq_rq_update_state() as suggested by Peter. v3: - Fixed possible extended seqcount / u64_stats_sync read looping spotted by Peter. - MQ_RQ_IDLE was incorrectly being set in complete_request instead of free_request. Fixed. v4: - Rebased on top of hctx_lock() refactoring patch. - Added comment explaining the use of hctx_lock() in completion path. v5: - Added comments requested by Bart. - Note the addition of BLK_EH_RESET_TIMER race condition in the commit message. Signed-off-by: Tejun Heo Cc: "jianchao.wang" Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 95c9a5c862e2..460798dbac1f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,6 +51,7 @@ struct blk_mq_hw_ctx { unsigned int queue_num; atomic_t nr_active; + unsigned int nr_expired; struct hlist_node cpuhp_dead; struct kobject kobj; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46e606f5b44b..ae563d01b29d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,8 @@ #include #include #include +#include +#include struct module; struct scsi_ioctl_command; @@ -230,6 +232,27 @@ struct request { unsigned short write_hint; + /* + * On blk-mq, the lower bits of ->gstate (generation number and + * state) carry the MQ_RQ_* state value and the upper bits the + * generation number which is monotonically incremented and used to + * distinguish the reuse instances. + * + * ->gstate_seq allows updates to ->gstate and other fields + * (currently ->deadline) during request start to be read + * atomically from the timeout path, so that it can operate on a + * coherent set of information. + */ + seqcount_t gstate_seq; + u64 gstate; + + /* + * ->aborted_gstate is used by the timeout to claim a specific + * recycle instance of this request. See blk_mq_timeout_work(). + */ + struct u64_stats_sync aborted_gstate_sync; + u64 aborted_gstate; + unsigned long deadline; struct list_head timeout_list; -- cgit v1.2.3 From 634f9e4631a88025d3b90c1884e9a1b6a13d01d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:51 -0800 Subject: blk-mq: remove REQ_ATOM_COMPLETE usages from blk-mq After the recent updates to use generation number and state based synchronization, blk-mq no longer depends on REQ_ATOM_COMPLETE except to avoid firing the same timeout multiple times. Remove all REQ_ATOM_COMPLETE usages and use a new rq_flags flag RQF_MQ_TIMEOUT_EXPIRED to avoid firing the same timeout multiple times. This removes atomic bitops from hot paths too. v2: Removed blk_clear_rq_complete() from blk_mq_rq_timed_out(). v3: Added RQF_MQ_TIMEOUT_EXPIRED flag. Signed-off-by: Tejun Heo Cc: "jianchao.wang" Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ae563d01b29d..007a7cf1f262 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -125,6 +125,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* The per-zone write lock is held for this request */ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* timeout is expired */ +#define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3 From 05707b64aed8f5f1674b25334fb720d651459d5e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:53 -0800 Subject: blk-mq: rename blk_mq_hw_ctx->queue_rq_srcu to ->srcu The RCU protection has been expanded to cover both queueing and completion paths making ->queue_rq_srcu a misnomer. Rename it to ->srcu as suggested by Bart. Signed-off-by: Tejun Heo Cc: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 460798dbac1f..8efcf49796a3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -66,7 +66,7 @@ struct blk_mq_hw_ctx { #endif /* Must be the last member - see also blk_mq_hw_ctx_size(). */ - struct srcu_struct queue_rq_srcu[0]; + struct srcu_struct srcu[0]; }; struct blk_mq_tag_set { -- cgit v1.2.3 From 9111e5686c8cf3905191d4feb819acd874437500 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 9 Jan 2018 12:04:16 -0700 Subject: block: Provide blk_status_t decoding for path errors This patch provides a common decoder for block status path related errors that may be retried so various entities wishing to consult this do not have to duplicate this decision. Acked-by: Mike Snitzer Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1e628e032da..2d973ac54b09 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -39,6 +39,34 @@ typedef u8 __bitwise blk_status_t; #define BLK_STS_AGAIN ((__force blk_status_t)12) +/** + * blk_path_error - returns true if error may be path related + * @error: status the request was completed with + * + * Description: + * This classifies block error status into non-retryable errors and ones + * that may be successful if retried on a failover path. + * + * Return: + * %false - retrying failover path will not help + * %true - may succeed if retried + */ +static inline bool blk_path_error(blk_status_t error) +{ + switch (error) { + case BLK_STS_NOTSUPP: + case BLK_STS_NOSPC: + case BLK_STS_TARGET: + case BLK_STS_NEXUS: + case BLK_STS_MEDIUM: + case BLK_STS_PROTECTION: + return false; + } + + /* Anything else could be a path failure, so should be retried */ + return true; +} + struct blk_issue_stat { u64 stat; }; -- cgit v1.2.3 From 76a86f9d027b342b8759a4b2f9f7fe046e284220 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:30:56 -0700 Subject: block: remove REQ_ATOM_POLL_SLEPT We don't need this to be an atomic flag, it can be a regular flag. We either end up on the same CPU for the polling, in which case the state is sane, or we did the sleep which would imply the needed barrier to ensure we see the right state. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 007a7cf1f262..ba31674d8581 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -127,6 +127,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* timeout is expired */ #define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) +/* already slept for hybrid poll */ +#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3 From 0a72e7f44964b9ada3e5c15820372e9cb119bf80 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jan 2018 14:23:42 -0700 Subject: block: add accessors for setting/querying request deadline We reduce the resolution of request expiry, but since we're already using jiffies for this where resolution depends on the kernel configuration and since the timeout resolution is coarse anyway, that should be fine. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba31674d8581..aa6698cf483c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -257,7 +257,9 @@ struct request { struct u64_stats_sync aborted_gstate_sync; u64 aborted_gstate; - unsigned long deadline; + /* access through blk_rq_set_deadline, blk_rq_deadline */ + unsigned long __deadline; + struct list_head timeout_list; /* -- cgit v1.2.3 From e14575b3d457f5806d79b85886ef94d9c29e3b2a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:34:25 -0700 Subject: block: convert REQ_ATOM_COMPLETE to stealing rq->__deadline bit We only have one atomic flag left. Instead of using an entire unsigned long for that, steal the bottom bit of the deadline field that we already reserved. Remove ->atomic_flags, since it's now unused. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aa6698cf483c..d4b2f7bb18d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -156,8 +156,6 @@ struct request { int internal_tag; - unsigned long atomic_flags; - /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ int tag; -- cgit v1.2.3 From 7c3fb70f0341f9d924818e648906774921f4bcb3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:46:39 -0700 Subject: block: rearrange a few request fields for better cache layout Move completion related items (like the call single data) near the end of the struct, instead of mixing them in with the initial queueing related fields. Move queuelist below the bio structures. Then we have all queueing related bits in the first cache line. This yields a 1.5-2% increase in IOPS for a null_blk test, both for sync and for high thread count access. Sync test goes form 975K to 992K, 32-thread case from 20.8M to 21.2M IOPS. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4b2f7bb18d6..71a9371c8182 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -141,12 +141,6 @@ typedef __u32 __bitwise req_flags_t; * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { - struct list_head queuelist; - union { - call_single_data_t csd; - u64 fifo_time; - }; - struct request_queue *q; struct blk_mq_ctx *mq_ctx; @@ -164,6 +158,8 @@ struct request { struct bio *bio; struct bio *biotail; + struct list_head queuelist; + /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used @@ -211,19 +207,16 @@ struct request { struct hd_struct *part; unsigned long start_time; struct blk_issue_stat issue_stat; -#ifdef CONFIG_BLK_CGROUP - struct request_list *rl; /* rl this rq is alloced from */ - unsigned long long start_time_ns; - unsigned long long io_start_time_ns; /* when passed to hardware */ -#endif /* Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; + #if defined(CONFIG_BLK_DEV_INTEGRITY) unsigned short nr_integrity_segments; #endif + unsigned short write_hint; unsigned short ioprio; unsigned int timeout; @@ -232,8 +225,6 @@ struct request { unsigned int extra_len; /* length of alignment and padding */ - unsigned short write_hint; - /* * On blk-mq, the lower bits of ->gstate (generation number and * state) carry the MQ_RQ_* state value and the upper bits the @@ -260,6 +251,11 @@ struct request { struct list_head timeout_list; + union { + call_single_data_t csd; + u64 fifo_time; + }; + /* * completion callback. */ @@ -268,6 +264,12 @@ struct request { /* for bidi */ struct request *next_rq; + +#ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ + unsigned long long start_time_ns; + unsigned long long io_start_time_ns; /* when passed to hardware */ +#endif }; static inline bool blk_rq_is_scsi(struct request *rq) -- cgit v1.2.3 From fa70d2e2c4a0a54ced98260c6a176cc94c876d27 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 8 Jan 2018 22:01:13 -0500 Subject: block: allow gendisk's request_queue registration to be deferred Since I can remember DM has forced the block layer to allow the allocation and initialization of the request_queue to be distinct operations. Reason for this is block/genhd.c:add_disk() has requires that the request_queue (and associated bdi) be tied to the gendisk before add_disk() is called -- because add_disk() also deals with exposing the request_queue via blk_register_queue(). DM's dynamic creation of arbitrary device types (and associated request_queue types) requires the DM device's gendisk be available so that DM table loads can establish a master/slave relationship with subordinate devices that are referenced by loaded DM tables -- using bd_link_disk_holder(). But until these DM tables, and their associated subordinate devices, are known DM cannot know what type of request_queue it needs -- nor what its queue_limits should be. This chicken and egg scenario has created all manner of problems for DM and, at times, the block layer. Summary of changes: - Add device_add_disk_no_queue_reg() and add_disk_no_queue_reg() variant that drivers may use to add a disk without also calling blk_register_queue(). Driver must call blk_register_queue() once its request_queue is fully initialized. - Return early from blk_unregister_queue() if QUEUE_FLAG_REGISTERED is not set. It won't be set if driver used add_disk_no_queue_reg() but driver encounters an error and must del_gendisk() before calling blk_register_queue(). - Export blk_register_queue(). These changes allow DM to use add_disk_no_queue_reg() to anchor its gendisk as the "master" for master/slave relationships DM must establish with subordinate devices referenced in DM tables that get loaded. Once all "slave" devices for a DM device are known its request_queue can be properly initialized and then advertised via sysfs -- important improvement being that no request_queue resource initialization performed by blk_register_queue() is missed for DM devices anymore. Signed-off-by: Mike Snitzer Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/genhd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5144ebe046c9..5e3531027b51 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk) { device_add_disk(NULL, disk); } +extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); +static inline void add_disk_no_queue_reg(struct gendisk *disk) +{ + device_add_disk_no_queue_reg(NULL, disk); +} extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); -- cgit v1.2.3 From ddc212313f16cd65fcf5e8d9ae223f8374822e4d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jan 2018 16:01:36 +0100 Subject: blkcg: simplify statistic accumulation code Some older compilers (gcc-4.4 through 4.6 in particular) struggle with the way that blkg_rwstat_read() returns a structure, leading to excessive stack usage and rather inefficient code: block/blk-cgroup.c: In function 'blkg_destroy': block/blk-cgroup.c:354:1: error: the frame size of 1296 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] block/cfq-iosched.c: In function 'cfqg_stats_add_aux': block/cfq-iosched.c:753:1: error: the frame size of 1928 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] block/bfq-cgroup.c: In function 'bfqg_stats_add_aux': block/bfq-cgroup.c:299:1: error: the frame size of 1928 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] I also notice that there is no point in using atomic accesses for the local variables, so storing the temporaries in simple 'u64' variables not only avoids the stack usage on older compilers but also improves the object code on modern versions. Fixes: e6269c445467 ("blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it") Acked-by: Tejun Heo Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e9825ff57b15..69bea82ebeb1 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, struct blkg_rwstat *from) { - struct blkg_rwstat v = blkg_rwstat_read(from); + u64 sum[BLKG_RWSTAT_NR]; int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(atomic64_read(&v.aux_cnt[i]) + - atomic64_read(&from->aux_cnt[i]), + sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } -- cgit v1.2.3 From 88de4598bca84e27b261685c06fff816b8d932a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2017 14:50:00 +0100 Subject: nvme-pci: clean up SMBSZ bit definitions Define the bit positions instead of macros using the magic values, and move the expanded helpers to calculate the size and size unit into the implementation C file. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Logan Gunthorpe --- include/linux/nvme.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index aea87f0d917b..4112e2bd747f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -124,14 +124,20 @@ enum { #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) -#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) -#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) - -#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) -#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) -#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) -#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) -#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) + +enum { + NVME_CMBSZ_SQS = 1 << 0, + NVME_CMBSZ_CQS = 1 << 1, + NVME_CMBSZ_LISTS = 1 << 2, + NVME_CMBSZ_RDS = 1 << 3, + NVME_CMBSZ_WDS = 1 << 4, + + NVME_CMBSZ_SZ_SHIFT = 12, + NVME_CMBSZ_SZ_MASK = 0xfffff, + + NVME_CMBSZ_SZU_SHIFT = 8, + NVME_CMBSZ_SZU_MASK = 0xf, +}; /* * Submission and Completion Queue Entry Sizes for the NVM command set. -- cgit v1.2.3 From 83d016ac86428dbca8a62d3e4fdc29e3ea39e535 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 Jan 2018 11:48:08 -0800 Subject: block: Unexport elv_register_queue() and elv_unregister_queue() These two functions are only called from inside the block layer so unexport them. Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 3d794b3dc532..6d9e230dffd2 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -extern int elv_register_queue(struct request_queue *q); -extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, -- cgit v1.2.3 From 8c7a8d1c4b9c30a2be3b31a2e6af1cefd45574eb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 Jan 2018 11:00:54 -0800 Subject: lib/scatterlist: Fix chaining support in sgl_alloc_order() This patch avoids that workloads with large block sizes (megabytes) can trigger the following call stack with the ib_srpt driver (that driver is the only driver that chains scatterlists allocated by sgl_alloc_order()): BUG: Bad page state in process kworker/0:1H pfn:2423a78 page:fffffb03d08e9e00 count:-3 mapcount:0 mapping: (null) index:0x0 flags: 0x57ffffc0000000() raw: 0057ffffc0000000 0000000000000000 0000000000000000 fffffffdffffffff raw: dead000000000100 dead000000000200 0000000000000000 0000000000000000 page dumped because: nonzero _count CPU: 0 PID: 733 Comm: kworker/0:1H Tainted: G I 4.15.0-rc7.bart+ #1 Hardware name: HP ProLiant DL380 G7, BIOS P67 08/16/2015 Workqueue: ib-comp-wq ib_cq_poll_work [ib_core] Call Trace: dump_stack+0x5c/0x83 bad_page+0xf5/0x10f get_page_from_freelist+0xa46/0x11b0 __alloc_pages_nodemask+0x103/0x290 sgl_alloc_order+0x101/0x180 target_alloc_sgl+0x2c/0x40 [target_core_mod] srpt_alloc_rw_ctxs+0x173/0x2d0 [ib_srpt] srpt_handle_new_iu+0x61e/0x7f0 [ib_srpt] __ib_process_cq+0x55/0xa0 [ib_core] ib_cq_poll_work+0x1b/0x60 [ib_core] process_one_work+0x141/0x340 worker_thread+0x47/0x3e0 kthread+0xf5/0x130 ret_from_fork+0x1f/0x30 Fixes: e80a0af4759a ("lib/scatterlist: Introduce sgl_alloc() and sgl_free()") Reported-by: Laurence Oberman Tested-by: Laurence Oberman Signed-off-by: Bart Van Assche Cc: Nicholas A. Bellinger Cc: Laurence Oberman Signed-off-by: Jens Axboe --- include/linux/scatterlist.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index b8a7c1d1dbe3..22b2131bcdcd 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -282,6 +282,7 @@ struct scatterlist *sgl_alloc_order(unsigned long long length, gfp_t gfp, unsigned int *nent_p); struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p); +void sgl_free_n_order(struct scatterlist *sgl, int nents, int order); void sgl_free_order(struct scatterlist *sgl, int order); void sgl_free(struct scatterlist *sgl); #endif /* CONFIG_SGL_ALLOC */ -- cgit v1.2.3 From f5ced52aaa5494c1feb9f80252cb2a2cde0dace8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 Jan 2018 08:58:56 -0800 Subject: block: Remove kblockd_schedule_delayed_work{,_on}() The previous patch removed all users of these two functions. Hence also remove the functions themselves. Reviewed-by: Mike Snitzer Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 71a9371c8182..afc43fb63c16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1800,8 +1800,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3