From 7759eb23fd9808a2e4498cf36a798ed65cde78ae Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 5 Sep 2018 15:45:54 -0600 Subject: block: remove bio_rewind_iter() It is pointed that bio_rewind_iter() is one very bad API[1]: 1) bio size may not be restored after rewinding 2) it causes some bogus change, such as 5151842b9d8732 (block: reset bi_iter.bi_done after splitting bio) 3) rewinding really makes things complicated wrt. bio splitting 4) unnecessary updating of .bi_done in fast path [1] https://marc.info/?t=153549924200005&r=1&w=2 So this patch takes Kent's suggestion to restore one bio into its original state via saving bio iterator(struct bvec_iter) in bio_integrity_prep(), given now bio_rewind_iter() is only used by bio integrity code. Cc: Dmitry Monakhov Cc: Hannes Reinecke Suggested-by: Kent Overstreet Acked-by: Kent Overstreet Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 22 ++++------------------ include/linux/bvec.h | 3 --- 2 files changed, 4 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 51371740d2a8..14b4fa266357 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -170,27 +170,11 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, { iter->bi_sector += bytes >> 9; - if (bio_no_advance_iter(bio)) { + if (bio_no_advance_iter(bio)) iter->bi_size -= bytes; - iter->bi_done += bytes; - } else { + else bvec_iter_advance(bio->bi_io_vec, iter, bytes); /* TODO: It is reasonable to complete bio with error here. */ - } -} - -static inline bool bio_rewind_iter(struct bio *bio, struct bvec_iter *iter, - unsigned int bytes) -{ - iter->bi_sector -= bytes >> 9; - - if (bio_no_advance_iter(bio)) { - iter->bi_size += bytes; - iter->bi_done -= bytes; - return true; - } - - return bvec_iter_rewind(bio->bi_io_vec, iter, bytes); } #define __bio_for_each_segment(bvl, bio, iter, start) \ @@ -353,6 +337,8 @@ struct bio_integrity_payload { unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ + struct bvec_iter bio_iter; /* for rewinding parent bio */ + struct work_struct bip_work; /* I/O completion */ struct bio_vec *bip_vec; diff --git a/include/linux/bvec.h b/include/linux/bvec.h index fe7a22dd133b..02c73c6aa805 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -40,8 +40,6 @@ struct bvec_iter { unsigned int bi_idx; /* current index into bvl_vec */ - unsigned int bi_done; /* number of bytes completed */ - unsigned int bi_bvec_done; /* number of bytes completed in current bvec */ }; @@ -85,7 +83,6 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, bytes -= len; iter->bi_size -= len; iter->bi_bvec_done += len; - iter->bi_done += len; if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) { iter->bi_bvec_done = 0; -- cgit v1.2.3 From 27e6fa996c534c32702aa4d32db0ffa383acd050 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:26 -0400 Subject: blkcg: fix ref count issue with bio_blkcg using task_css The accessor function bio_blkcg either returns the blkcg associated with the bio or finds one in the current context. This can cause an issue when trying to associate a bio with a blkcg. Particularly, it's the third case that is problematic: return css_to_blkcg(task_css(current, io_cgrp_id)); As the above may race against task migration and the cgroup exiting, it is not always ok to take a reference on the blkcg returned from bio_blkcg. This patch adds association ahead of calling bio_blkcg rather than after. This makes association a required and explicit step along the code paths for calling bio_blkcg. blk_get_rl is modified as well to get a reference to the blkcg it may use and blk_put_rl will always put the reference back. Association is also moved above the bio_blkcg call to ensure it will not return NULL in blk-iolatency. BFQ and CFQ utilize this flaw, but due to the complexity, I do not want to address this in this series. I've created a private version of the function with notes not to use it describing the flaw. Hopefully soon, that code can be cleaned up. Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 101 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6d766a19f2bb..24067a1f8b36 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -230,22 +230,100 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. + */ +static inline struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} + +/** + * blkcg_get_css - find and get a reference to the css + * + * Find the css associated with either the kthread or the current task. + * This takes a reference on the blkcg which will need to be managed by the + * caller. + */ +static inline struct cgroup_subsys_state *blkcg_get_css(void) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + css = kthread_blkcg(); + if (css) { + css_get(css); + } else { + /* + * This is a bit complicated. It is possible task_css is seeing + * an old css pointer here. This is caused by the current + * thread migrating away from this cgroup and this cgroup dying. + * css_tryget() will fail when trying to take a ref on a cgroup + * that's ref count has hit 0. + * + * Therefore, if it does fail, this means current must have + * been swapped away already and this is waiting for it to + * propagate on the polling cpu. Hence the use of cpu_relax(). + */ + while (true) { + css = task_css(current, io_cgrp_id); + if (likely(css_tryget(css))) + break; + cpu_relax(); + } + } + + rcu_read_unlock(); + + return css; +} static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } -static inline struct blkcg *bio_blkcg(struct bio *bio) +/** + * __bio_blkcg - internal version of bio_blkcg for bfq and cfq + * + * DO NOT USE. + * There is a flaw using this version of the function. In particular, this was + * used in a broken paradigm where association was called on the given css. It + * is possible though that the returned css from task_css() is in the process + * of dying due to migration of the current task. So it is improper to assume + * *_get() is going to succeed. Both BFQ and CFQ rely on this logic and will + * take additional work to handle more gracefully. + */ +static inline struct blkcg *__bio_blkcg(struct bio *bio) { - struct cgroup_subsys_state *css; + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + return css_to_blkcg(blkcg_css()); +} +/** + * bio_blkcg - grab the blkcg associated with a bio + * @bio: target bio + * + * This returns the blkcg associated with a bio, NULL if not associated. + * Callers are expected to either handle NULL or know association has been + * done prior to calling this. + */ +static inline struct blkcg *bio_blkcg(struct bio *bio) +{ if (bio && bio->bi_css) return css_to_blkcg(bio->bi_css); - css = kthread_blkcg(); - if (css) - return css_to_blkcg(css); - return css_to_blkcg(task_css(current, io_cgrp_id)); + return NULL; } static inline bool blk_cgroup_congested(void) @@ -534,6 +612,10 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, rcu_read_lock(); blkcg = bio_blkcg(bio); + if (blkcg) + css_get(&blkcg->css); + else + blkcg = css_to_blkcg(blkcg_get_css()); /* bypass blkg lookup and use @q->root_rl directly for root */ if (blkcg == &blkcg_root) @@ -565,6 +647,8 @@ root_rl: */ static inline void blk_put_rl(struct request_list *rl) { + /* an additional ref is always taken for rl */ + css_put(&rl->blkg->blkcg->css); if (rl->blkg->blkcg != &blkcg_root) blkg_put(rl->blkg); } @@ -805,10 +889,10 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, bool throtl = false; rcu_read_lock(); - blkcg = bio_blkcg(bio); /* associate blkcg if bio hasn't attached one */ - bio_associate_blkcg(bio, &blkcg->css); + bio_associate_blkcg(bio, NULL); + blkcg = bio_blkcg(bio); blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { @@ -930,6 +1014,7 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } +static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, -- cgit v1.2.3 From 49f4c2dc2b5066e9211101c59cc0828e81d41614 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:27 -0400 Subject: blkcg: update blkg_lookup_create to do locking To know when to create a blkg, the general pattern is to do a blkg_lookup and if that fails, lock and then do a lookup again and if that fails finally create. It doesn't make much sense for everyone who wants to do creation to write this themselves. This changes blkg_lookup_create to do locking and implement this pattern. The old blkg_lookup_create is renamed to __blkg_lookup_create. If a call site wants to do its own error handling or already owns the queue lock, they can use __blkg_lookup_create. This will be used in upcoming patches. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Reviewed-by: Liu Bo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 24067a1f8b36..cc0f238530f6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -184,6 +184,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); +struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -897,7 +899,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); + blkg = __blkg_lookup_create(blkcg, q); if (IS_ERR(blkg)) blkg = NULL; spin_unlock_irq(q->queue_lock); -- cgit v1.2.3 From 07b05bcc3213ac9f8c28c9d835b4bf3d5798cc60 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:28 -0400 Subject: blkcg: convert blkg_lookup_create to find closest blkg There are several scenarios where blkg_lookup_create can fail. Examples include the blkcg dying, request_queue is dying, or simply being OOM. At the end of the day, most handle this by simply falling back to the q->root_blkg and calling it a day. This patch implements the notion of closest blkg. During blkg_lookup_create, if it fails to create, return the closest blkg found or the q->root_blkg. blkg_try_get_closest is introduced and used during association so a bio is always attached to a blkg. Acked-by: Tejun Heo Signed-off-by: Dennis Zhou Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index cc0f238530f6..1fbff1bbb651 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -549,6 +549,20 @@ static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) return NULL; } +/** + * blkg_try_get_closest - try and get a blkg ref on the closet blkg + * @blkg: blkg to get + * + * This walks up the blkg tree to find the closest non-dying blkg and returns + * the blkg that it did association with as it may not be the passed in blkg. + */ +static inline struct blkcg_gq *blkg_try_get_closest(struct blkcg_gq *blkg) +{ + while (!atomic_inc_not_zero(&blkg->refcnt)) + blkg = blkg->parent; + + return blkg; +} void __blkg_release_rcu(struct rcu_head *rcu); -- cgit v1.2.3 From a7b39b4e961c4e2b3ed837803a7441a65c90ce33 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:29 -0400 Subject: blkcg: always associate a bio with a blkg Previously, blkg's were only assigned as needed by blk-iolatency and blk-throttle. bio->css was also always being associated while blkg was being looked up and then thrown away in blkcg_bio_issue_check. This patch begins the cleanup of bio->css and bio->bi_blkg by always associating a blkg in blkcg_bio_issue_check. This tries to create the blkg, but if it is not possible, falls back to using the root_blkg of the request_queue. Therefore, a bio will always be associated with a blkg. The duplicate association logic is removed from blk-throttle and blk-iolatency. Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 +++ include/linux/blk-cgroup.h | 16 ++-------------- 2 files changed, 5 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 14b4fa266357..829cd0bb407d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -542,11 +542,14 @@ static inline int bio_associate_blkcg_from_page(struct bio *bio, #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); +int bio_associate_create_blkg(struct request_queue *q, struct bio *bio); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } +static inline int bio_associate_create_blkg(struct request_queue *q, + struct bio *bio) { return 0; } static inline void bio_disassociate_task(struct bio *bio) { } static inline void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1fbff1bbb651..6e33ad1d92b4 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -900,29 +900,17 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { - struct blkcg *blkcg; struct blkcg_gq *blkg; bool throtl = false; rcu_read_lock(); - /* associate blkcg if bio hasn't attached one */ - bio_associate_blkcg(bio, NULL); - blkcg = bio_blkcg(bio); - - blkg = blkg_lookup(blkcg, q); - if (unlikely(!blkg)) { - spin_lock_irq(q->queue_lock); - blkg = __blkg_lookup_create(blkcg, q); - if (IS_ERR(blkg)) - blkg = NULL; - spin_unlock_irq(q->queue_lock); - } + bio_associate_create_blkg(q, bio); + blkg = bio->bi_blkg; throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { - blkg = blkg ?: q->root_blkg; /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the -- cgit v1.2.3 From 5bf9a1f3b4efef7e463105dde8bba4d2397909c2 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:30 -0400 Subject: blkcg: consolidate bio_issue_init to be a part of core bio_issue_init among other things initializes the timestamp for an IO. Rather than have this logic handled by policies, this consolidates it to be on the init paths (normal, clone, bounce clone). Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Reviewed-by: Liu Bo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6e33ad1d92b4..a6b6e741a75e 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -897,6 +897,12 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg struct bio *bio) { return false; } #endif + +static inline void blkcg_bio_issue_init(struct bio *bio) +{ + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +} + static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { @@ -922,6 +928,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } + blkcg_bio_issue_init(bio); + rcu_read_unlock(); return !throtl; } @@ -1034,6 +1042,7 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } +static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { return true; } -- cgit v1.2.3 From 74b7c02a9bc124ee3df0d77880ee26db0a325516 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:31 -0400 Subject: blkcg: associate a blkg for pages being evicted by swap A prior patch in this series added blkg association to bios issued by cgroups. There are two other paths that we want to attribute work back to the appropriate cgroup: swap and writeback. Here we modify the way swap tags bios to include the blkg. Writeback will be tackle in the next patch. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/bio.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 829cd0bb407d..c73a870ebc0e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -533,21 +533,26 @@ do { \ disk_devt((bio)->bi_disk) #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); +int bio_associate_blkg_from_page(struct bio *bio, struct page *page); #else -static inline int bio_associate_blkcg_from_page(struct bio *bio, - struct page *page) { return 0; } +static inline int bio_associate_blkg_from_page(struct bio *bio, + struct page *page) { return 0; } #endif #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); +int bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css); int bio_associate_create_blkg(struct request_queue *q, struct bio *bio); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } +static inline int bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ return 0; } static inline int bio_associate_create_blkg(struct request_queue *q, struct bio *bio) { return 0; } static inline void bio_disassociate_task(struct bio *bio) { } -- cgit v1.2.3 From bdc2491708c47601603918a9a20acddef6e1d814 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:32 -0400 Subject: blkcg: associate writeback bios with a blkg One of the goals of this series is to remove a separate reference to the css of the bio. This can and should be accessed via bio_blkcg. In this patch, the wbc_init_bio call is changed such that it must be called after a queue has been associated with the bio. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/writeback.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fdfd04e348f6..738a0c24874f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -246,7 +246,8 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup - * writeback context. + * writeback context. Must be called after the bio has been associated with + * a device. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { @@ -257,7 +258,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) * regular writeback instead of writing things out itself. */ if (wbc->wb) - bio_associate_blkcg(bio, wbc->wb->blkcg_css); + bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ -- cgit v1.2.3 From c839e7a03f92bafd71fd145b470dcdc7f43f2d4c Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:33 -0400 Subject: blkcg: remove bio->bi_css and instead use bio->bi_blkg Prior patches ensured that all bios are now associated with some blkg. This now makes bio->bi_css unnecessary as blkg maintains a reference to the blkcg already. This patch removes the field bi_css and transfers corresponding uses to access via bi_blkg. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 +++------ include/linux/blk-cgroup.h | 8 ++++---- include/linux/blk_types.h | 1 - 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index c73a870ebc0e..e973876625a8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -540,24 +540,21 @@ static inline int bio_associate_blkg_from_page(struct bio *bio, #endif #ifdef CONFIG_BLK_CGROUP -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); int bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css); int bio_associate_create_blkg(struct request_queue *q, struct bio *bio); void bio_disassociate_task(struct bio *bio); -void bio_clone_blkcg_association(struct bio *dst, struct bio *src); +void bio_clone_blkg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ -static inline int bio_associate_blkcg(struct bio *bio, - struct cgroup_subsys_state *blkcg_css) { return 0; } static inline int bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { return 0; } static inline int bio_associate_create_blkg(struct request_queue *q, struct bio *bio) { return 0; } static inline void bio_disassociate_task(struct bio *bio) { } -static inline void bio_clone_blkcg_association(struct bio *dst, - struct bio *src) { } +static inline void bio_clone_blkg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index a6b6e741a75e..c41cfcc2b4d8 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -308,8 +308,8 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) */ static inline struct blkcg *__bio_blkcg(struct bio *bio) { - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; return css_to_blkcg(blkcg_css()); } @@ -323,8 +323,8 @@ static inline struct blkcg *__bio_blkcg(struct bio *bio) */ static inline struct blkcg *bio_blkcg(struct bio *bio) { - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; return NULL; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f6dfb30737d8..9578c7ab1eb6 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -178,7 +178,6 @@ struct bio { * release. Read comment on top of bio_associate_current(). */ struct io_context *bi_ioc; - struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; #endif -- cgit v1.2.3 From f0fcb3ec89f37167810e660b0595d9a6155d9807 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:34 -0400 Subject: blkcg: remove additional reference to the css The previous patch in this series removed carrying around a pointer to the css in blkg. However, the blkg association logic still relied on taking a reference on the css to ensure we wouldn't fail in getting a reference for the blkg. Here the implicit dependency on the css is removed. The association continues to rely on the tryget logic walking up the blkg tree. This streamlines the three ways that association can happen: normal, swap, and writeback. Acked-by: Tejun Heo Signed-off-by: Dennis Zhou Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 52 ++++------------------------------------------ include/linux/cgroup.h | 2 ++ 2 files changed, 6 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index c41cfcc2b4d8..2951ea3541b1 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -249,47 +249,6 @@ static inline struct cgroup_subsys_state *blkcg_css(void) return task_css(current, io_cgrp_id); } -/** - * blkcg_get_css - find and get a reference to the css - * - * Find the css associated with either the kthread or the current task. - * This takes a reference on the blkcg which will need to be managed by the - * caller. - */ -static inline struct cgroup_subsys_state *blkcg_get_css(void) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - - css = kthread_blkcg(); - if (css) { - css_get(css); - } else { - /* - * This is a bit complicated. It is possible task_css is seeing - * an old css pointer here. This is caused by the current - * thread migrating away from this cgroup and this cgroup dying. - * css_tryget() will fail when trying to take a ref on a cgroup - * that's ref count has hit 0. - * - * Therefore, if it does fail, this means current must have - * been swapped away already and this is waiting for it to - * propagate on the polling cpu. Hence the use of cpu_relax(). - */ - while (true) { - css = task_css(current, io_cgrp_id); - if (likely(css_tryget(css))) - break; - cpu_relax(); - } - } - - rcu_read_unlock(); - - return css; -} - static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; @@ -628,10 +587,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, rcu_read_lock(); blkcg = bio_blkcg(bio); - if (blkcg) - css_get(&blkcg->css); - else - blkcg = css_to_blkcg(blkcg_get_css()); + if (!blkcg) + blkcg = css_to_blkcg(blkcg_css()); /* bypass blkg lookup and use @q->root_rl directly for root */ if (blkcg == &blkcg_root) @@ -646,7 +603,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, if (unlikely(!blkg)) goto root_rl; - blkg_get(blkg); + if (!blkg_try_get(blkg)) + goto root_rl; rcu_read_unlock(); return &blkg->rl; root_rl: @@ -663,8 +621,6 @@ root_rl: */ static inline void blk_put_rl(struct request_list *rl) { - /* an additional ref is always taken for rl */ - css_put(&rl->blkg->blkcg->css); if (rl->blkg->blkcg != &blkcg_root) blkg_put(rl->blkg); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 32c553556bbd..b8bcbdeb2eac 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -93,6 +93,8 @@ extern struct css_set init_css_set; bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, + struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, -- cgit v1.2.3 From e2b0989954ae7c80609f77e7ce203bea6d2c54e1 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:35 -0400 Subject: blkcg: cleanup and make blk_get_rl use blkg_lookup_create blk_get_rl is responsible for identifying which request_list a request should be allocated to. Try get logic was added earlier, but semantically the logic was not changed. This patch makes better use of the bio already having a reference to the blkg in the hot path. The cold path uses a better fallback of blkg_lookup_create rather than just blkg_lookup and then falling back to the q->root_rl. If lookup_create fails with anything but -ENODEV, it falls back to q->root_rl. A clarifying comment is added to explain why q->root_rl is used rather than the root blkg's rl. Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 2951ea3541b1..d2f7f1b00fcf 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -586,28 +586,36 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, rcu_read_lock(); - blkcg = bio_blkcg(bio); - if (!blkcg) - blkcg = css_to_blkcg(blkcg_css()); + if (bio && bio->bi_blkg) { + blkcg = bio->bi_blkg->blkcg; + if (blkcg == &blkcg_root) + goto rl_use_root; + + blkg_get(bio->bi_blkg); + rcu_read_unlock(); + return &bio->bi_blkg->rl; + } - /* bypass blkg lookup and use @q->root_rl directly for root */ + blkcg = css_to_blkcg(blkcg_css()); if (blkcg == &blkcg_root) - goto root_rl; + goto rl_use_root; - /* - * Try to use blkg->rl. blkg lookup may fail under memory pressure - * or if either the blkcg or queue is going away. Fall back to - * root_rl in such cases. - */ blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) - goto root_rl; + blkg = __blkg_lookup_create(blkcg, q); if (!blkg_try_get(blkg)) - goto root_rl; + goto rl_use_root; + rcu_read_unlock(); return &blkg->rl; -root_rl: + + /* + * Each blkg has its own request_list, however, the root blkcg + * uses the request_queue's root_rl. This is to avoid most + * overhead for the root blkcg. + */ +rl_use_root: rcu_read_unlock(); return &q->root_rl; } -- cgit v1.2.3 From b3b9f24f5fcc099c41f7dc1d02350635830888e5 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:36 -0400 Subject: blkcg: change blkg reference counting to use percpu_ref Now that every bio is associated with a blkg, this puts the use of blkg_get, blkg_try_get, and blkg_put on the hot path. This switches over the refcnt in blkg to use percpu_ref. Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index d2f7f1b00fcf..7ff5d8ba8c7a 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -126,7 +126,7 @@ struct blkcg_gq { struct request_list rl; /* reference count */ - atomic_t refcnt; + struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; @@ -490,8 +490,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) */ static inline void blkg_get(struct blkcg_gq *blkg) { - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - atomic_inc(&blkg->refcnt); + percpu_ref_get(&blkg->refcnt); } /** @@ -503,7 +502,7 @@ static inline void blkg_get(struct blkcg_gq *blkg) */ static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) { - if (atomic_inc_not_zero(&blkg->refcnt)) + if (percpu_ref_tryget(&blkg->refcnt)) return blkg; return NULL; } @@ -517,23 +516,19 @@ static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) */ static inline struct blkcg_gq *blkg_try_get_closest(struct blkcg_gq *blkg) { - while (!atomic_inc_not_zero(&blkg->refcnt)) + while (!percpu_ref_tryget(&blkg->refcnt)) blkg = blkg->parent; return blkg; } -void __blkg_release_rcu(struct rcu_head *rcu); - /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - if (atomic_dec_and_test(&blkg->refcnt)) - call_rcu(&blkg->rcu_head, __blkg_release_rcu); + percpu_ref_put(&blkg->refcnt); } /** -- cgit v1.2.3 From 101246ec02b54adf6a77180a01ccbe310add2c32 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 11 Sep 2018 14:41:37 -0400 Subject: blkcg: rename blkg_try_get to blkg_tryget blkg reference counting now uses percpu_ref rather than atomic_t. Let's make this consistent with css_tryget. This renames blkg_try_get to blkg_tryget and now returns a bool rather than the blkg or NULL. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7ff5d8ba8c7a..b7fd08013de2 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -494,27 +494,25 @@ static inline void blkg_get(struct blkcg_gq *blkg) } /** - * blkg_try_get - try and get a blkg reference + * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ -static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) +static inline bool blkg_tryget(struct blkcg_gq *blkg) { - if (percpu_ref_tryget(&blkg->refcnt)) - return blkg; - return NULL; + return percpu_ref_tryget(&blkg->refcnt); } /** - * blkg_try_get_closest - try and get a blkg ref on the closet blkg + * blkg_tryget_closest - try and get a blkg ref on the closet blkg * @blkg: blkg to get * * This walks up the blkg tree to find the closest non-dying blkg and returns * the blkg that it did association with as it may not be the passed in blkg. */ -static inline struct blkcg_gq *blkg_try_get_closest(struct blkcg_gq *blkg) +static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) { while (!percpu_ref_tryget(&blkg->refcnt)) blkg = blkg->parent; @@ -599,7 +597,7 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, if (unlikely(!blkg)) blkg = __blkg_lookup_create(blkcg, q); - if (!blkg_try_get(blkg)) + if (!blkg_tryget(blkg)) goto rl_use_root; rcu_read_unlock(); -- cgit v1.2.3 From 43b729bfe9cf30ad11499a66e3b7bd300c716d44 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Sep 2018 09:43:47 +0200 Subject: block: move integrity_req_gap_{back,front}_merge to blk.h No need to expose these to drivers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d6869e0e2b64..bc534c857344 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1843,26 +1843,6 @@ queue_max_integrity_segments(struct request_queue *q) return q->limits.max_integrity_segments; } -static inline bool integrity_req_gap_back_merge(struct request *req, - struct bio *next) -{ - struct bio_integrity_payload *bip = bio_integrity(req->bio); - struct bio_integrity_payload *bip_next = bio_integrity(next); - - return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], - bip_next->bip_vec[0].bv_offset); -} - -static inline bool integrity_req_gap_front_merge(struct request *req, - struct bio *bio) -{ - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_integrity_payload *bip_next = bio_integrity(req->bio); - - return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], - bip_next->bip_vec[0].bv_offset); -} - /** * bio_integrity_intervals - Return number of integrity intervals for a bio * @bi: blk_integrity profile for device @@ -1947,17 +1927,6 @@ static inline bool blk_integrity_merge_bio(struct request_queue *rq, return true; } -static inline bool integrity_req_gap_back_merge(struct request *req, - struct bio *next) -{ - return false; -} -static inline bool integrity_req_gap_front_merge(struct request *req, - struct bio *bio) -{ - return false; -} - static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { -- cgit v1.2.3 From e9907009cbfc0c93d987d5a8fdf3d6c3c7b89717 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Sep 2018 09:43:48 +0200 Subject: block: move req_gap_{back,front}_merge to blk-merge.c Keep it close to the actual users instead of exposing the function to all drivers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 69 -------------------------------------------------- 1 file changed, 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc534c857344..b7e676bb01bc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1695,75 +1695,6 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } -/* - * Check if the two bvecs from two bios can be merged to one segment. - * If yes, no need to check gap between the two bios since the 1st bio - * and the 1st bvec in the 2nd bio can be handled in one segment. - */ -static inline bool bios_segs_mergeable(struct request_queue *q, - struct bio *prev, struct bio_vec *prev_last_bv, - struct bio_vec *next_first_bv) -{ - if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv)) - return false; - if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv)) - return false; - if (prev->bi_seg_back_size + next_first_bv->bv_len > - queue_max_segment_size(q)) - return false; - return true; -} - -static inline bool bio_will_gap(struct request_queue *q, - struct request *prev_rq, - struct bio *prev, - struct bio *next) -{ - if (bio_has_data(prev) && queue_virt_boundary(q)) { - struct bio_vec pb, nb; - - /* - * don't merge if the 1st bio starts with non-zero - * offset, otherwise it is quite difficult to respect - * sg gap limit. We work hard to merge a huge number of small - * single bios in case of mkfs. - */ - if (prev_rq) - bio_get_first_bvec(prev_rq->bio, &pb); - else - bio_get_first_bvec(prev, &pb); - if (pb.bv_offset) - return true; - - /* - * We don't need to worry about the situation that the - * merged segment ends in unaligned virt boundary: - * - * - if 'pb' ends aligned, the merged segment ends aligned - * - if 'pb' ends unaligned, the next bio must include - * one single bvec of 'nb', otherwise the 'nb' can't - * merge with 'pb' - */ - bio_get_last_bvec(prev, &pb); - bio_get_first_bvec(next, &nb); - - if (!bios_segs_mergeable(q, prev, &pb, &nb)) - return __bvec_gap_to_prev(q, &pb, nb.bv_offset); - } - - return false; -} - -static inline bool req_gap_back_merge(struct request *req, struct bio *bio) -{ - return bio_will_gap(req->q, req, req->biotail, bio); -} - -static inline bool req_gap_front_merge(struct request *req, struct bio *bio) -{ - return bio_will_gap(req->q, NULL, bio, req->bio); -} - int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); -- cgit v1.2.3 From 27ca1d4ed04ea29dc77b47190a3cc82697023e76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Sep 2018 09:43:49 +0200 Subject: block: move req_gap_back_merge to blk.h No need to expose these helpers outside the block layer. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b7e676bb01bc..1d5e14139795 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1676,25 +1676,6 @@ static inline void put_dev_sector(Sector p) put_page(p.v); } -static inline bool __bvec_gap_to_prev(struct request_queue *q, - struct bio_vec *bprv, unsigned int offset) -{ - return offset || - ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); -} - -/* - * Check if adding a bio_vec after bprv with offset would create a gap in - * the SG list. Most drivers don't care about this, but some do. - */ -static inline bool bvec_gap_to_prev(struct request_queue *q, - struct bio_vec *bprv, unsigned int offset) -{ - if (!queue_virt_boundary(q)) - return false; - return __bvec_gap_to_prev(q, bprv, offset); -} - int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); -- cgit v1.2.3 From 6a9f5f240adfdced863a098d34f8f05ca6ab9d5f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Sep 2018 09:43:50 +0200 Subject: block: simplify BIOVEC_PHYS_MERGEABLE Turn the macro into an inline, move it to blk.h and simplify the arch hooks a bit. Also rename the function to biovec_phys_mergeable as there is no need to shout. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index e973876625a8..e2adb96346f0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -140,19 +140,6 @@ static inline bool bio_full(struct bio *bio) /* * merge helpers etc */ - -/* Default implementation of BIOVEC_PHYS_MERGEABLE */ -#define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) - -/* - * allow arch override, for eg virtualized architectures (put in asm/io.h) - */ -#ifndef BIOVEC_PHYS_MERGEABLE -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - __BIOVEC_PHYS_MERGEABLE(vec1, vec2) -#endif - #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ (((addr1) | (mask)) == (((addr2) - 1) | (mask))) #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ -- cgit v1.2.3 From 3dccdae54fe836a22cee9dc6df9fd1708ae075ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Sep 2018 09:43:52 +0200 Subject: block: merge BIOVEC_SEG_BOUNDARY into biovec_phys_mergeable These two checks should always be performed together, so merge them into a single helper. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index e2adb96346f0..9b580e1cb2e8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -137,14 +137,6 @@ static inline bool bio_full(struct bio *bio) */ #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) -/* - * merge helpers etc - */ -#define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ - (((addr1) | (mask)) == (((addr2) - 1) | (mask))) -#define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ - __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q))) - /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it -- cgit v1.2.3 From 6e768461c215eaf8912e6c23e40fdff1cd962aca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Sep 2018 09:43:53 +0200 Subject: block: remove bvec_to_phys We only use it in biovec_phys_mergeable and a m68k paravirt driver, so just opencode it there. Also remove the pointless unsigned long cast for the offset in the opencoded instances. Signed-off-by: Christoph Hellwig Reviewed-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- include/linux/bio.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9b580e1cb2e8..9ad4b0a487a4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -132,11 +132,6 @@ static inline bool bio_full(struct bio *bio) return bio->bi_vcnt >= bio->bi_max_vecs; } -/* - * will die - */ -#define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) - /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it -- cgit v1.2.3 From bceacbfa48bf132d8d88058904cf5859eeb4f3e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Sep 2018 09:43:54 +0200 Subject: block: don't include io.h from bio.h Now that we don't need an override for BIOVEC_PHYS_MERGEABLE there is no need to drag this header in. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9ad4b0a487a4..b3d47862b1b4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -24,9 +24,6 @@ #include #ifdef CONFIG_BLOCK - -#include - /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include -- cgit v1.2.3 From 65969e5cb249b765cee261c8f0458ec778b0c22f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Sep 2018 09:43:55 +0200 Subject: block: don't include bug.h from bio.h No need to pull in the BUG() defintion. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index b3d47862b1b4..f447b0ebb288 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -21,7 +21,6 @@ #include #include #include -#include #ifdef CONFIG_BLOCK /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ -- cgit v1.2.3 From bca6b067b0b269a7b8ba129e2a918309ca8b4a55 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 26 Sep 2018 14:01:03 -0700 Subject: block: Move power management code into a new source file Move the code for runtime power management from blk-core.c into the new source file blk-pm.c. Move the corresponding declarations from into . For CONFIG_PM=n, leave out the declarations of the functions that are not used in that mode. This patch not only reduces the number of #ifdefs in the block layer core code but also reduces the size of header file and hence should help to reduce the build time of the Linux kernel if CONFIG_PM is not defined. Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Cc: Jianchao Wang Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Alan Stern Signed-off-by: Jens Axboe --- include/linux/blk-pm.h | 24 ++++++++++++++++++++++++ include/linux/blkdev.h | 23 ----------------------- 2 files changed, 24 insertions(+), 23 deletions(-) create mode 100644 include/linux/blk-pm.h (limited to 'include/linux') diff --git a/include/linux/blk-pm.h b/include/linux/blk-pm.h new file mode 100644 index 000000000000..b80c65aba249 --- /dev/null +++ b/include/linux/blk-pm.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BLK_PM_H_ +#define _BLK_PM_H_ + +struct device; +struct request_queue; + +/* + * block layer runtime pm functions + */ +#ifdef CONFIG_PM +extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev); +extern int blk_pre_runtime_suspend(struct request_queue *q); +extern void blk_post_runtime_suspend(struct request_queue *q, int err); +extern void blk_pre_runtime_resume(struct request_queue *q); +extern void blk_post_runtime_resume(struct request_queue *q, int err); +extern void blk_set_runtime_active(struct request_queue *q); +#else +static inline void blk_pm_runtime_init(struct request_queue *q, + struct device *dev) {} +#endif + +#endif /* _BLK_PM_H_ */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1d5e14139795..cd863511dedb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1280,29 +1280,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); -/* - * block layer runtime pm functions - */ -#ifdef CONFIG_PM -extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev); -extern int blk_pre_runtime_suspend(struct request_queue *q); -extern void blk_post_runtime_suspend(struct request_queue *q, int err); -extern void blk_pre_runtime_resume(struct request_queue *q); -extern void blk_post_runtime_resume(struct request_queue *q, int err); -extern void blk_set_runtime_active(struct request_queue *q); -#else -static inline void blk_pm_runtime_init(struct request_queue *q, - struct device *dev) {} -static inline int blk_pre_runtime_suspend(struct request_queue *q) -{ - return -ENOSYS; -} -static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {} -static inline void blk_pre_runtime_resume(struct request_queue *q) {} -static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} -static inline void blk_set_runtime_active(struct request_queue *q) {} -#endif - /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests -- cgit v1.2.3 From cd84a62e0078dce09f4ed349bec84f86c9d54b30 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 26 Sep 2018 14:01:04 -0700 Subject: block, scsi: Change the preempt-only flag into a counter The RQF_PREEMPT flag is used for three purposes: - In the SCSI core, for making sure that power management requests are executed even if a device is in the "quiesced" state. - For domain validation by SCSI drivers that use the parallel port. - In the IDE driver, for IDE preempt requests. Rename "preempt-only" into "pm-only" because the primary purpose of this mode is power management. Since the power management core may but does not have to resume a runtime suspended device before performing system-wide suspend and since a later patch will set "pm-only" mode as long as a block device is runtime suspended, make it possible to set "pm-only" mode from more than one context. Since with this change scsi_device_quiesce() is no longer idempotent, make that function return early if it is called for a quiesced queue. Signed-off-by: Bart Van Assche Acked-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Cc: Jianchao Wang Cc: Johannes Thumshirn Cc: Alan Stern Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cd863511dedb..13bb54f26736 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -504,6 +504,12 @@ struct request_queue { * various queue flags, see QUEUE_* below */ unsigned long queue_flags; + /* + * Number of contexts that have called blk_set_pm_only(). If this + * counter is above zero then only RQF_PM and RQF_PREEMPT requests are + * processed. + */ + atomic_t pm_only; /* * ida allocated id for this queue. Used to index queues from @@ -698,7 +704,6 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ -#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -736,12 +741,11 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q); ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) -#define blk_queue_preempt_only(q) \ - test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) +#define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) -extern int blk_set_preempt_only(struct request_queue *q); -extern void blk_clear_preempt_only(struct request_queue *q); +extern void blk_set_pm_only(struct request_queue *q); +extern void blk_clear_pm_only(struct request_queue *q); static inline int queue_in_flight(struct request_queue *q) { -- cgit v1.2.3 From 18c9a6bbe0645a05172a900740b9d2d379d54320 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 26 Sep 2018 14:01:07 -0700 Subject: percpu-refcount: Introduce percpu_ref_resurrect() This function will be used in a later patch to switch the struct request_queue q_usage_counter from killed back to live. In contrast to percpu_ref_reinit(), this new function does not require that the refcount is zero. Signed-off-by: Bart Van Assche Acked-by: Tejun Heo Reviewed-by: Ming Lei Cc: Christoph Hellwig Cc: Jianchao Wang Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/percpu-refcount.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 009cdf3d65b6..b297cd1cd4f1 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -108,6 +108,7 @@ void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); +void percpu_ref_resurrect(struct percpu_ref *ref); void percpu_ref_reinit(struct percpu_ref *ref); /** -- cgit v1.2.3 From ed88660a5372faa67c168c3db5201e33e488c9fd Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 27 Sep 2018 15:55:51 -0700 Subject: block: move call of scheduler's ->completed_request() hook Commit 4bc6339a583c ("block: move blk_stat_add() to __blk_mq_end_request()") consolidated some calls using ktime_get() so we'd only need to call it once. Kyber's ->completed_request() hook also calls ktime_get(), so let's move it to the same place, too. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index a02deea30185..015bb59c0331 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -111,7 +111,7 @@ struct elevator_mq_ops { void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); - void (*completed_request)(struct request *); + void (*completed_request)(struct request *, u64); void (*started_request)(struct request *); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); -- cgit v1.2.3 From fef912bf860e8e7e48a2bfb978a356bba743a8b7 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 28 Sep 2018 08:17:19 +0200 Subject: block: genhd: add 'groups' argument to device_add_disk Update device_add_disk() to take an 'groups' argument so that individual drivers can register a device with additional sysfs attributes. This avoids race condition the driver would otherwise have if these groups were to be created with sysfs_add_groups(). Signed-off-by: Martin Wilck Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/genhd.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 57864422a2c8..0b820ff05839 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -399,10 +399,11 @@ static inline void free_part_info(struct hd_struct *part) extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part); /* block/genhd.c */ -extern void device_add_disk(struct device *parent, struct gendisk *disk); +extern void device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups); static inline void add_disk(struct gendisk *disk) { - device_add_disk(NULL, disk); + device_add_disk(NULL, disk, NULL); } extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); static inline void add_disk_no_queue_reg(struct gendisk *disk) -- cgit v1.2.3 From 783f4a4408e1251d17f333ad56abac24dde988b9 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 27 Sep 2018 16:58:54 -0700 Subject: nvme: call nvme_complete_rq when nvmf_check_ready fails for mpath I/O When an io is rejected by nvmf_check_ready() due to validation of the controller state, the nvmf_fail_nonready_command() will normally return BLK_STS_RESOURCE to requeue and retry. However, if the controller is dying or the I/O is marked for NVMe multipath, the I/O is failed so that the controller can terminate or so that the io can be issued on a different path. Unfortunately, as this reject point is before the transport has accepted the command, blk-mq ends up completing the I/O and never calls nvme_complete_rq(), which is where multipath may preserve or re-route the I/O. The end result is, the device user ends up seeing an EIO error. Example: single path connectivity, controller is under load, and a reset is induced. An I/O is received: a) while the reset state has been set but the queues have yet to be stopped; or b) after queues are started (at end of reset) but before the reconnect has completed. The I/O finishes with an EIO status. This patch makes the following changes: - Adds the HOST_PATH_ERROR pathing status from TP4028 - Modifies the reject point such that it appears to queue successfully, but actually completes the io with the new pathing status and calls nvme_complete_rq(). - nvme_complete_rq() recognizes the new status, avoids resetting the controller (likely was already done in order to get this new status), and calls the multipather to clear the current path that errored. This allows the next command (retry or new command) to select a new path if there is one. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 68e91ef5494c..818dbe9331be 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1241,6 +1241,7 @@ enum { NVME_SC_ANA_PERSISTENT_LOSS = 0x301, NVME_SC_ANA_INACCESSIBLE = 0x302, NVME_SC_ANA_TRANSITION = 0x303, + NVME_SC_HOST_PATH_ERROR = 0x370, NVME_SC_DNR = 0x4000, }; -- cgit v1.2.3 From d7b6801673f95e5f72efd3ffba1bcbb606883049 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 9 Oct 2018 13:11:32 +0200 Subject: lightnvm: combine 1.2 and 2.0 command flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add nvm_set_flags helper to enable core to appropriately set the command flags for read/write/erase depending on which version a drive supports. The flags arguments can be distilled into the access hint, scrambling, and program/erase suspend. Replace the access hint with a "is_seq" parameter. The rest of the flags are dependent on the command opcode, which is trivial to detect and set. Signed-off-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e9e0d1c7eaf5..8acc2fe277d6 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -305,6 +305,8 @@ struct nvm_rq { u64 ppa_status; /* ppa media status */ int error; + int is_seq; /* Sequential hint flag. 1.2 only */ + void *private; }; -- cgit v1.2.3 From 656e33ca3d405196f94133babc4e38454a49cb73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 9 Oct 2018 13:11:34 +0200 Subject: lightnvm: move device L2P detection to core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A 1.2 device is able to manage the logical to physical mapping table internally or leave it to the host. A target only supports one of those approaches, and therefore must check on initialization. Move this check to core to avoid each target implement the check. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 8acc2fe277d6..f4a84694e5e2 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -495,9 +495,15 @@ typedef void (nvm_tgt_exit_fn)(void *, bool); typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); +enum { + NVM_TGT_F_DEV_L2P = 0, + NVM_TGT_F_HOST_L2P = 1 << 0, +}; + struct nvm_tgt_type { const char *name; unsigned int version[3]; + int flags; /* target entry points */ nvm_tgt_make_rq_fn *make_rq; -- cgit v1.2.3 From aff3fb18f957de93e629c7d3d2c4ef1f360aa511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 9 Oct 2018 13:11:36 +0200 Subject: lightnvm: move bad block and chunk state logic to core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk implements two data paths for recovery line state. One for 1.2 and another for 2.0, instead of having pblk implement these, combine them in the core to reduce complexity and make available to other targets. The new interface will adhere to the 2.0 chunk definition, including managing open chunks with an active write pointer. To provide this interface, a 1.2 device recovers the state of the chunks by manually detecting if a chunk is either free/open/close/offline, and if open, scanning the flash pages sequentially to find the next writeable page. This process takes on average ~10 seconds on a device with 64 dies, 1024 blocks and 60us read access time. The process can be parallelized but is left out for maintenance simplicity, as the 1.2 specification is deprecated. For 2.0 devices, the logic is maintained internally in the drive and retrieved through the 2.0 interface. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index f4a84694e5e2..0106984400bc 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -86,8 +86,8 @@ struct nvm_chk_meta; typedef int (nvm_id_fn)(struct nvm_dev *); typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); -typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, struct nvm_chk_meta *, - sector_t, int); +typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int, + struct nvm_chk_meta *); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); @@ -532,18 +532,13 @@ extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); - -extern int nvm_get_chunk_meta(struct nvm_tgt_dev *tgt_dev, - struct nvm_chk_meta *meta, struct ppa_addr ppa, - int nchks); - -extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, +extern int nvm_get_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr, + int, struct nvm_chk_meta *); +extern int nvm_set_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); extern void nvm_end_io(struct nvm_rq *); -extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); -extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From 2cf99bbd106f89fc72f778e8ad9d5538f1ef939b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 9 Oct 2018 13:11:41 +0200 Subject: lightnvm: pblk: add helpers for chunk addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement helpers to go from ppas to a chunk within a line and an address within a chunk. These helpers will be used on the patches adding trace support in pblk, which will be sent in this window. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 0106984400bc..77743a02ec0d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -487,6 +487,25 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, return l; } +static inline u64 dev_to_chunk_addr(struct nvm_dev *dev, void *addrf, + struct ppa_addr p) +{ + struct nvm_geo *geo = &dev->geo; + u64 caddr; + + if (geo->version == NVM_OCSSD_SPEC_12) { + struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)addrf; + + caddr = (u64)p.g.pg << ppaf->pg_offset; + caddr |= (u64)p.g.pl << ppaf->pln_offset; + caddr |= (u64)p.g.sec << ppaf->sec_offset; + } else { + caddr = p.m.sec; + } + + return caddr; +} + typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, -- cgit v1.2.3 From d68a9344041b6dd304ff382d0c7805869f09944f Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 9 Oct 2018 13:11:46 +0200 Subject: lightnvm: introduce nvm_rq_to_ppa_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a number of places in the lightnvm subsystem where the user iterates over the ppa list. Before iterating, the user must know if it is a single or multiple LBAs due to vector commands using either the nvm_rq ->ppa_addr or ->ppa_list fields on command submission, which leads to open-coding the if/else statement. Instead of having multiple if/else's, move it into a function that can be called by its users. A nice side effect of this cleanup is that this patch fixes up a bunch of cases where we don't consider the single-ppa case in pblk. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 77743a02ec0d..50ac5b21297c 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -320,6 +320,11 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) return rqdata + 1; } +static inline struct ppa_addr *nvm_rq_to_ppa_list(struct nvm_rq *rqd) +{ + return (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; +} + enum { NVM_BLK_ST_FREE = 0x1, /* Free block */ NVM_BLK_ST_TGT = 0x2, /* Block in use by target */ -- cgit v1.2.3 From 7f985f9a691dc25ddcfc9dc7ff21199f5ee1569c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 9 Oct 2018 13:11:56 +0200 Subject: lightnvm: move ppa transformations to core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Continuing the effort of moving 1.2 and 2.0 specific code to core, move 64_to_32 and 32_to_64 ppa helpers from pblk to core. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 50ac5b21297c..eb7300c20f24 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -511,6 +511,89 @@ static inline u64 dev_to_chunk_addr(struct nvm_dev *dev, void *addrf, return caddr; } +static inline struct ppa_addr nvm_ppa32_to_ppa64(struct nvm_dev *dev, + void *addrf, u32 ppa32) +{ + struct ppa_addr ppa64; + + ppa64.ppa = 0; + + if (ppa32 == -1) { + ppa64.ppa = ADDR_EMPTY; + } else if (ppa32 & (1U << 31)) { + ppa64.c.line = ppa32 & ((~0U) >> 1); + ppa64.c.is_cached = 1; + } else { + struct nvm_geo *geo = &dev->geo; + + if (geo->version == NVM_OCSSD_SPEC_12) { + struct nvm_addrf_12 *ppaf = addrf; + + ppa64.g.ch = (ppa32 & ppaf->ch_mask) >> + ppaf->ch_offset; + ppa64.g.lun = (ppa32 & ppaf->lun_mask) >> + ppaf->lun_offset; + ppa64.g.blk = (ppa32 & ppaf->blk_mask) >> + ppaf->blk_offset; + ppa64.g.pg = (ppa32 & ppaf->pg_mask) >> + ppaf->pg_offset; + ppa64.g.pl = (ppa32 & ppaf->pln_mask) >> + ppaf->pln_offset; + ppa64.g.sec = (ppa32 & ppaf->sec_mask) >> + ppaf->sec_offset; + } else { + struct nvm_addrf *lbaf = addrf; + + ppa64.m.grp = (ppa32 & lbaf->ch_mask) >> + lbaf->ch_offset; + ppa64.m.pu = (ppa32 & lbaf->lun_mask) >> + lbaf->lun_offset; + ppa64.m.chk = (ppa32 & lbaf->chk_mask) >> + lbaf->chk_offset; + ppa64.m.sec = (ppa32 & lbaf->sec_mask) >> + lbaf->sec_offset; + } + } + + return ppa64; +} + +static inline u32 nvm_ppa64_to_ppa32(struct nvm_dev *dev, + void *addrf, struct ppa_addr ppa64) +{ + u32 ppa32 = 0; + + if (ppa64.ppa == ADDR_EMPTY) { + ppa32 = ~0U; + } else if (ppa64.c.is_cached) { + ppa32 |= ppa64.c.line; + ppa32 |= 1U << 31; + } else { + struct nvm_geo *geo = &dev->geo; + + if (geo->version == NVM_OCSSD_SPEC_12) { + struct nvm_addrf_12 *ppaf = addrf; + + ppa32 |= ppa64.g.ch << ppaf->ch_offset; + ppa32 |= ppa64.g.lun << ppaf->lun_offset; + ppa32 |= ppa64.g.blk << ppaf->blk_offset; + ppa32 |= ppa64.g.pg << ppaf->pg_offset; + ppa32 |= ppa64.g.pl << ppaf->pln_offset; + ppa32 |= ppa64.g.sec << ppaf->sec_offset; + } else { + struct nvm_addrf *lbaf = addrf; + + ppa32 |= ppa64.m.grp << lbaf->ch_offset; + ppa32 |= ppa64.m.pu << lbaf->lun_offset; + ppa32 |= ppa64.m.chk << lbaf->chk_offset; + ppa32 |= ppa64.m.sec << lbaf->sec_offset; + } + } + + return ppa32; +} + + typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, -- cgit v1.2.3 From bf82fa2f584f1c6e12df8c04fca715d53e7f32d5 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 9 Oct 2018 13:11:59 +0200 Subject: lightnvm: pblk: fix mapping issue on failed writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 1.2-devices, the mapping-out of remaning sectors in the failed-write's block can result in an infinite loop, stalling the write pipeline, fix this. Fixes: 6a3abf5beef6 ("lightnvm: pblk: rework write error recovery path") Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index eb7300c20f24..2fdeac1a420d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -593,6 +593,42 @@ static inline u32 nvm_ppa64_to_ppa32(struct nvm_dev *dev, return ppa32; } +static inline int nvm_next_ppa_in_chk(struct nvm_tgt_dev *dev, + struct ppa_addr *ppa) +{ + struct nvm_geo *geo = &dev->geo; + int last = 0; + + if (geo->version == NVM_OCSSD_SPEC_12) { + int sec = ppa->g.sec; + + sec++; + if (sec == geo->ws_min) { + int pg = ppa->g.pg; + + sec = 0; + pg++; + if (pg == geo->num_pg) { + int pl = ppa->g.pl; + + pg = 0; + pl++; + if (pl == geo->num_pln) + last = 1; + + ppa->g.pl = pl; + } + ppa->g.pg = pg; + } + ppa->g.sec = sec; + } else { + ppa->m.sec++; + if (ppa->m.sec == geo->clba) + last = 1; + } + + return last; +} typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -- cgit v1.2.3 From 4822e902f9bdffaea2817471365e000966f0d1a1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Oct 2018 10:07:06 +0300 Subject: block: describe difference between flags IO_STAT and STATS This adds reasonable comments, but they definitely needs better names. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dee46c20701b..61207560e826 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -108,7 +108,7 @@ typedef __u32 __bitwise req_flags_t; #define RQF_QUIET ((__force req_flags_t)(1 << 11)) /* elevator private data attached */ #define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) -/* account I/O stat */ +/* account into disk and partition IO statistics */ #define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) /* request came from our alloc pool */ #define RQF_ALLOCED ((__force req_flags_t)(1 << 14)) @@ -116,7 +116,7 @@ typedef __u32 __bitwise req_flags_t; #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ #define RQF_HASHED ((__force req_flags_t)(1 << 16)) -/* IO stats tracking on */ +/* track IO completion time */ #define RQF_STATS ((__force req_flags_t)(1 << 17)) /* Look at ->special_vec for the actual data payload instead of the bio chain. */ @@ -685,7 +685,7 @@ struct request_queue { #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ -#define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ +#define QUEUE_FLAG_IO_STAT 10 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_DISCARD 11 /* supports DISCARD */ #define QUEUE_FLAG_NOXMERGES 12 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 13 /* Contributes to random pool */ @@ -699,7 +699,7 @@ struct request_queue { #define QUEUE_FLAG_FUA 21 /* device supports FUA writes */ #define QUEUE_FLAG_FLUSH_NQ 22 /* flush not queueuable */ #define QUEUE_FLAG_DAX 23 /* device supports DAX */ -#define QUEUE_FLAG_STATS 24 /* track rq completion times */ +#define QUEUE_FLAG_STATS 24 /* track IO start and completion times */ #define QUEUE_FLAG_POLL_STATS 25 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ -- cgit v1.2.3 From 9316a9ed6895c4ad2f0cde171d486f80c55d8283 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Oct 2018 08:40:37 -0600 Subject: blk-mq: provide helper for setting up an SQ queue and tag set This pattern is repeated throughout all the blk-mq conversions. Provide a basic helper to get it done. Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1da59c16f637..2286dc12c6bc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -203,6 +203,10 @@ enum { struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); +struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, + unsigned int queue_depth, + unsigned int set_flags); int blk_mq_register_dev(struct device *, struct request_queue *); void blk_mq_unregister_dev(struct device *, struct request_queue *); -- cgit v1.2.3 From 891b7c5fbf619d2a314e424775b3c232eb227e90 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Oct 2018 08:09:58 -0600 Subject: mtd_blkdevs: convert to blk-mq Straight forward conversion, using an internal list to enable the driver to pull requests at will. Dynamically allocate the tag set to avoid having to pull in the block headers for blktrans.h, since various mtd drivers use block conflicting names for defines and functions. Cc: David Woodhouse Cc: linux-mtd@lists.infradead.org Tested-by: Richard Weinberger Signed-off-by: Jens Axboe --- include/linux/mtd/blktrans.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h index e93837f647de..1d3ade69d39a 100644 --- a/include/linux/mtd/blktrans.h +++ b/include/linux/mtd/blktrans.h @@ -23,7 +23,6 @@ #include #include #include -#include struct hd_geometry; struct mtd_info; @@ -44,9 +43,9 @@ struct mtd_blktrans_dev { struct kref ref; struct gendisk *disk; struct attribute_group *disk_attributes; - struct workqueue_struct *wq; - struct work_struct work; struct request_queue *rq; + struct list_head rq_list; + struct blk_mq_tag_set *tag_set; spinlock_t queue_lock; void *priv; fmode_t file_mode; -- cgit v1.2.3 From c87228f16f0acdf242dee62f139013212208473f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 11 Oct 2018 12:20:45 -0700 Subject: amiflop: fold headers into C file amifd.h and amifdreg.h are only used from amiflop.c, and they're pretty small, so move the contents to amiflop.c and get rid of the .h files. This is preparation for adding a struct blk_mq_tag_set to struct amiga_floppy_struct. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/amifd.h | 63 ------------------------------------- include/linux/amifdreg.h | 82 ------------------------------------------------ 2 files changed, 145 deletions(-) delete mode 100644 include/linux/amifd.h delete mode 100644 include/linux/amifdreg.h (limited to 'include/linux') diff --git a/include/linux/amifd.h b/include/linux/amifd.h deleted file mode 100644 index 202a77dbe46d..000000000000 --- a/include/linux/amifd.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _AMIFD_H -#define _AMIFD_H - -/* Definitions for the Amiga floppy driver */ - -#include - -#define FD_MAX_UNITS 4 /* Max. Number of drives */ -#define FLOPPY_MAX_SECTORS 22 /* Max. Number of sectors per track */ - -#ifndef ASSEMBLER - -struct fd_data_type { - char *name; /* description of data type */ - int sects; /* sectors per track */ -#ifdef __STDC__ - int (*read_fkt)(int); - void (*write_fkt)(int); -#else - int (*read_fkt)(); /* read whole track */ - void (*write_fkt)(); /* write whole track */ -#endif -}; - -/* -** Floppy type descriptions -*/ - -struct fd_drive_type { - unsigned long code; /* code returned from drive */ - char *name; /* description of drive */ - unsigned int tracks; /* number of tracks */ - unsigned int heads; /* number of heads */ - unsigned int read_size; /* raw read size for one track */ - unsigned int write_size; /* raw write size for one track */ - unsigned int sect_mult; /* sectors and gap multiplier (HD = 2) */ - unsigned int precomp1; /* start track for precomp 1 */ - unsigned int precomp2; /* start track for precomp 2 */ - unsigned int step_delay; /* time (in ms) for delay after step */ - unsigned int settle_time; /* time to settle after dir change */ - unsigned int side_time; /* time needed to change sides */ -}; - -struct amiga_floppy_struct { - struct fd_drive_type *type; /* type of floppy for this unit */ - struct fd_data_type *dtype; /* type of floppy for this unit */ - int track; /* current track (-1 == unknown) */ - unsigned char *trackbuf; /* current track (kmaloc()'d */ - - int blocks; /* total # blocks on disk */ - - int changed; /* true when not known */ - int disk; /* disk in drive (-1 == unknown) */ - int motor; /* true when motor is at speed */ - int busy; /* true when drive is active */ - int dirty; /* true when trackbuf is not on disk */ - int status; /* current error code for unit */ - struct gendisk *gendisk; -}; -#endif - -#endif diff --git a/include/linux/amifdreg.h b/include/linux/amifdreg.h deleted file mode 100644 index 9b514d05ec70..000000000000 --- a/include/linux/amifdreg.h +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_AMIFDREG_H -#define _LINUX_AMIFDREG_H - -/* -** CIAAPRA bits (read only) -*/ - -#define DSKRDY (0x1<<5) /* disk ready when low */ -#define DSKTRACK0 (0x1<<4) /* head at track zero when low */ -#define DSKPROT (0x1<<3) /* disk protected when low */ -#define DSKCHANGE (0x1<<2) /* low when disk removed */ - -/* -** CIAAPRB bits (read/write) -*/ - -#define DSKMOTOR (0x1<<7) /* motor on when low */ -#define DSKSEL3 (0x1<<6) /* select drive 3 when low */ -#define DSKSEL2 (0x1<<5) /* select drive 2 when low */ -#define DSKSEL1 (0x1<<4) /* select drive 1 when low */ -#define DSKSEL0 (0x1<<3) /* select drive 0 when low */ -#define DSKSIDE (0x1<<2) /* side selection: 0 = upper, 1 = lower */ -#define DSKDIREC (0x1<<1) /* step direction: 0=in, 1=out (to trk 0) */ -#define DSKSTEP (0x1) /* pulse low to step head 1 track */ - -/* -** DSKBYTR bits (read only) -*/ - -#define DSKBYT (1<<15) /* register contains valid byte when set */ -#define DMAON (1<<14) /* disk DMA enabled */ -#define DISKWRITE (1<<13) /* disk write bit in DSKLEN enabled */ -#define WORDEQUAL (1<<12) /* DSKSYNC register match when true */ -/* bits 7-0 are data */ - -/* -** ADKCON/ADKCONR bits -*/ - -#ifndef SETCLR -#define ADK_SETCLR (1<<15) /* control bit */ -#endif -#define ADK_PRECOMP1 (1<<14) /* precompensation selection */ -#define ADK_PRECOMP0 (1<<13) /* 00=none, 01=140ns, 10=280ns, 11=500ns */ -#define ADK_MFMPREC (1<<12) /* 0=GCR precomp., 1=MFM precomp. */ -#define ADK_WORDSYNC (1<<10) /* enable DSKSYNC auto DMA */ -#define ADK_MSBSYNC (1<<9) /* when 1, enable sync on MSbit (for GCR) */ -#define ADK_FAST (1<<8) /* bit cell: 0=2us (GCR), 1=1us (MFM) */ - -/* -** DSKLEN bits -*/ - -#define DSKLEN_DMAEN (1<<15) -#define DSKLEN_WRITE (1<<14) - -/* -** INTENA/INTREQ bits -*/ - -#define DSKINDEX (0x1<<4) /* DSKINDEX bit */ - -/* -** Misc -*/ - -#define MFM_SYNC 0x4489 /* standard MFM sync value */ - -/* Values for FD_COMMAND */ -#define FD_RECALIBRATE 0x07 /* move to track 0 */ -#define FD_SEEK 0x0F /* seek track */ -#define FD_READ 0xE6 /* read with MT, MFM, SKip deleted */ -#define FD_WRITE 0xC5 /* write with MT, MFM */ -#define FD_SENSEI 0x08 /* Sense Interrupt Status */ -#define FD_SPECIFY 0x03 /* specify HUT etc */ -#define FD_FORMAT 0x4D /* format one track */ -#define FD_VERSION 0x10 /* get version code */ -#define FD_CONFIGURE 0x13 /* configure FIFO operation */ -#define FD_PERPENDICULAR 0x12 /* perpendicular r/w mode */ - -#endif /* _LINUX_AMIFDREG_H */ -- cgit v1.2.3 From b2c3fa546705944e748666b474ffdaebaec0569f Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Sat, 20 Oct 2018 14:56:11 -0400 Subject: blkcg: fix edge case for blk_get_rl() under memory pressure It is possible for blkg creation to fail when in blk_get_rl(). In this situation, the fallback logic returns the nearest created blkg. There is however special handling for the request_list for the root blkcg. This fixes the missing edge case from the earlier series changing blk_get_rl(). Fixes: e2b0989954ae ("blkcg: cleanup and make blk_get_rl use blkg_lookup_create") Signed-off-by: Dennis Zhou Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b7fd08013de2..1e76ceebeb5d 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -597,7 +597,7 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, if (unlikely(!blkg)) blkg = __blkg_lookup_create(blkcg, q); - if (!blkg_tryget(blkg)) + if (blkg->blkcg == &blkcg_root || !blkg_tryget(blkg)) goto rl_use_root; rcu_read_unlock(); -- cgit v1.2.3 From d459d853c2edc793135e4bfa4e345c758f1cc859 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Sat, 20 Oct 2018 14:56:12 -0400 Subject: blkcg: reassociate bios when make_request() is called recursively When submitting a bio, multiple recursive calls to make_request() may occur. This causes the initial associate done in blkcg_bio_issue_check() to be incorrect and reference the prior request_queue. This introduces a helper to do reassociation when make_request() is recursively called. Fixes: a7b39b4e961c ("blkcg: always associate a bio with a blkg") Reported-by: Valdis Kletnieks Signed-off-by: Dennis Zhou Tested-by: Valdis Kletnieks Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index f447b0ebb288..b47c7f716731 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -514,6 +514,7 @@ int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); int bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css); int bio_associate_create_blkg(struct request_queue *q, struct bio *bio); +int bio_reassociate_blkg(struct request_queue *q, struct bio *bio); void bio_disassociate_task(struct bio *bio); void bio_clone_blkg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ @@ -522,6 +523,8 @@ static inline int bio_associate_blkg_from_css(struct bio *bio, { return 0; } static inline int bio_associate_create_blkg(struct request_queue *q, struct bio *bio) { return 0; } +static inline int bio_reassociate_blkg(struct request_queue *q, struct bio *bio) +{ return 0; } static inline void bio_disassociate_task(struct bio *bio) { } static inline void bio_clone_blkg_association(struct bio *dst, struct bio *src) { } -- cgit v1.2.3