From 6b1d83d274486615cc341e410467a98fd9c27c0a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:19 -0700 Subject: block: Remove bdev_nr_zones() Remove this function since it has no callers. This function was introduced in commit 6cc77e9cb080 ("block: introduce zoned block devices zone write locking"). Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Christoph Hellwig Cc: Matias Bjorling Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 79226ca8f80f..49a400afb146 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1639,15 +1639,6 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) return 0; } -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return blk_queue_nr_zones(q); - return 0; -} - static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3 From 7c8542b7982264226cf94102950343185869b584 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:20 -0700 Subject: block: Inline blk_queue_nr_zones() Since the implementation of blk_queue_nr_zones() is trivial and since it only has a single caller, inline this function. Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49a400afb146..905daa7c647e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -800,11 +800,6 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } -static inline unsigned int blk_queue_nr_zones(struct request_queue *q) -{ - return q->nr_zones; -} - static inline unsigned int blk_queue_zone_no(struct request_queue *q, sector_t sector) { -- cgit v1.2.3 From 6a5ac9846508ad7d1d23881d9d5add35f2e6ae71 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:21 -0700 Subject: block: Make struct request_queue smaller for CONFIG_BLK_DEV_ZONED=n Exclude zoned block device members from struct request_queue for CONFIG_BLK_DEV_ZONED == n. Avoid breaking the build by only building the code that uses these struct request_queue members if CONFIG_BLK_DEV_ZONED != n. Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 905daa7c647e..ca5a8b046894 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -592,6 +592,7 @@ struct request_queue { struct queue_limits limits; +#ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information for request dispatch control. * nr_zones is the total number of zones of the device. This is always @@ -612,6 +613,7 @@ struct request_queue { unsigned int nr_zones; unsigned long *seq_zones_bitmap; unsigned long *seq_zones_wlock; +#endif /* CONFIG_BLK_DEV_ZONED */ /* * sg stuff @@ -800,6 +802,7 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } +#ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_queue_zone_no(struct request_queue *q, sector_t sector) { @@ -815,6 +818,7 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q, return false; return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); } +#endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) { @@ -1065,6 +1069,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } +#ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_rq_zone_no(struct request *rq) { return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); @@ -1074,6 +1079,7 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq) { return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); } +#endif /* CONFIG_BLK_DEV_ZONED */ /* * Some commands like WRITE SAME have a payload or data transfer size which -- cgit v1.2.3 From 5815839b3ca16bb1d45939270871169f6803a121 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:47 +0800 Subject: blk-mq: introduce new lock for protecting hctx->dispatch_wait Now hctx->lock is only acquired when adding hctx->dispatch_wait to one wait queue, but not held when removing it from the wait queue. IO hang can be observed easily if SCHED RESTART is disabled, that means now RESTART exits just for fixing the issue in blk_mq_mark_tag_wait(). This patch fixes the issue by introducing hctx->dispatch_wait_lock and holding it for removing hctx->dispatch_wait in blk_mq_dispatch_wake(), since we need to avoid acquiring hctx->lock in irq context. Fixes: eb619fdb2d4cb8b3d3419 ("blk-mq: fix issue with shared tag queue re-running") Cc: Christoph Hellwig Cc: Omar Sandoval Cc: Bart Van Assche Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e3147eb74222..ea690254dab7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -39,6 +39,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; + spinlock_t dispatch_wait_lock; wait_queue_entry_t dispatch_wait; atomic_t wait_index; -- cgit v1.2.3 From 97889f9ac24f8d2fc8e703ea7f80c162bab10d4d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:48 +0800 Subject: blk-mq: remove synchronize_rcu() from blk_mq_del_queue_tag_set() We have to remove synchronize_rcu() from blk_queue_cleanup(), otherwise long delay can be caused during lun probe. For removing it, we have to avoid to iterate the set->tag_list in IO path, eg, blk_mq_sched_restart(). This patch reverts 5b79413946d (Revert "blk-mq: don't handle TAG_SHARED in restart"). Given we have fixed enough IO hang issue, and there isn't any reason to restart all queues in one tags any more, see the following reasons: 1) blk-mq core can deal with shared-tags case well via blk_mq_get_driver_tag(), which can wake up queues waiting for driver tag. 2) SCSI is a bit special because it may return BLK_STS_RESOURCE if queue, target or host is ready, but SCSI built-in restart can cover all these well, see scsi_end_request(), queue will be rerun after any request initiated from this host/target is completed. In my test on scsi_debug(8 luns), this patch may improve IOPS by 20% ~ 30% when running I/O on these 8 luns concurrently. Fixes: 705cda97ee3a ("blk-mq: Make it safe to use RCU to iterate over blk_mq_tag_set.tag_list") Cc: Omar Sandoval Cc: Bart Van Assche Cc: Christoph Hellwig Cc: Martin K. Petersen Cc: linux-scsi@vger.kernel.org Reported-by: Andrew Jones Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ca5a8b046894..9d05646d5059 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -442,8 +442,6 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ - atomic_t shared_hctx_restart; - struct blk_queue_stats *stats; struct rq_wb *rq_wb; -- cgit v1.2.3 From 6e768717304bdbe8d2897ca8298f6b58863fdc41 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 3 Jul 2018 09:03:16 -0600 Subject: blk-mq: dequeue request one by one from sw queue if hctx is busy It won't be efficient to dequeue request one by one from sw queue, but we have to do that when queue is busy for better merge performance. This patch takes the Exponential Weighted Moving Average(EWMA) to figure out if queue is busy, then only dequeue request one by one from sw queue when queue is busy. Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue") Cc: Kashyap Desai Cc: Laurence Oberman Cc: Omar Sandoval Cc: Christoph Hellwig Cc: Bart Van Assche Cc: Hannes Reinecke Reported-by: Kashyap Desai Tested-by: Kashyap Desai Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ea690254dab7..d710e92874cc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,9 +35,10 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; struct blk_mq_ctx *dispatch_from; + unsigned int dispatch_busy; - struct blk_mq_ctx **ctxs; unsigned int nr_ctx; + struct blk_mq_ctx **ctxs; spinlock_t dispatch_wait_lock; wait_queue_entry_t dispatch_wait; -- cgit v1.2.3 From 08e18eab0c579ad84399c1899c11899734854eb2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:50 -0400 Subject: block: add bi_blkg to the bio for cgroups Currently io.low uses a bi_cg_private to stash its private data for the blkg, however other blkcg policies may want to use this as well. Since we can get the private data out of the blkg, move this to bi_blkg in the bio and make it generic, then we can use bio_associate_blkg() to attach the blkg to the bio. Theoretically we could simply replace the bi_css with this since we can get to all the same information from the blkg, however you have to lookup the blkg, so for example wbc_init_bio() would have to lookup and possibly allocate the blkg for the css it was trying to attach to the bio. This could be problematic and result in us either not attaching the css at all to the bio, or falling back to the root blkcg if we are unable to allocate the corresponding blkg. So for now do this, and in the future if possible we could just replace the bi_css with bi_blkg and update the helpers to do the correct translation. Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + include/linux/blk_types.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index f08f5fe7bd08..a279ba384da9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -555,6 +555,7 @@ do { \ #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3c4f390aea4b..3364d42ebe08 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -179,8 +179,8 @@ struct bio { */ struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; + struct blkcg_gq *bi_blkg; #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - void *bi_cg_private; struct bio_issue bi_issue; #endif #endif -- cgit v1.2.3 From c7c98fd37653955d3a17dd4f1fa67aba070096a9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:51 -0400 Subject: block: introduce bio_issue_as_root_blkg Instead of forcing all file systems to get the right context on their bio's, simply check for REQ_META to see if we need to issue as the root blkg. We don't want to force all bio's to have the root blkg associated with them if REQ_META is set, as some controllers (blk-iolatency) need to know who the originating cgroup is so it can backcharge them for the work they are doing. This helper will make sure that the controllers do the proper thing wrt the IO priority and backcharging. Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6c666fd7de3c..69aa71dc0c04 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -238,6 +238,22 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return css_to_blkcg(task_css(current, io_cgrp_id)); } +/** + * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg + * @return: true if this bio needs to be submitted with the root blkg context. + * + * In order to avoid priority inversions we sometimes need to issue a bio as if + * it were attached to the root blkg, and then backcharge to the actual owning + * blkg. The idea is we do bio_blkcg() to look up the actual context for the + * bio and attach the appropriate blkg to the bio. Then we call this helper and + * if it is true run with the root blkg for that queue and then do any + * backcharging to the originating cgroup once the io is complete. + */ +static inline bool bio_issue_as_root_blkg(struct bio *bio) +{ + return (bio->bi_opf & REQ_META); +} + /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest -- cgit v1.2.3 From 903d23f0a354f226fa78f1c1c34b60aaf992e812 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:52 -0400 Subject: blk-cgroup: allow controllers to output their own stats blk-iolatency has a few stats that it would like to print out, and instead of adding a bunch of crap to the generic code just provide a helper so that controllers can add stuff to the stat line if they want to. Hide it behind a boot option since it changes the output of io.stat from normal, and these stats are only interesting to developers. Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 69aa71dc0c04..b41292726c0f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,6 +148,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); +typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf, + size_t size); struct blkcg_policy { int plid; @@ -167,6 +169,7 @@ struct blkcg_policy { blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; + blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; -- cgit v1.2.3 From 0d1e0c7cd5909d6c6aa0957179318e13fcca971a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:53 -0400 Subject: blk: introduce REQ_SWAP Just like REQ_META, it's important to know the IO coming down is swap in order to guard against potential IO priority inversion issues with cgroups. Add REQ_SWAP and use it for all swap IO, and add it to our bio_issue_as_root_blkg helper. Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 +- include/linux/blk_types.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b41292726c0f..a8f9ba8f33a4 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -254,7 +254,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { - return (bio->bi_opf & REQ_META); + return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3364d42ebe08..0ffc34c5cc83 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,7 +329,7 @@ enum req_flag_bits { /* for driver use */ __REQ_DRV, - + __REQ_SWAP, /* swapping request. */ __REQ_NR_BITS, /* stops here */ }; @@ -351,6 +351,7 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) #define REQ_DRV (1ULL << __REQ_DRV) +#define REQ_SWAP (1ULL << __REQ_SWAP) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From 0d3bd88d54f513723602b361dccfc71639f50779 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Jul 2018 11:14:54 -0400 Subject: swap,blkcg: issue swap io with the appropriate context For backcharging we need to know who the page belongs to when swapping it out. We don't worry about things that do ->rw_page (zram etc) at the moment, we're only worried about pages that actually go to a block device. Signed-off-by: Tejun Heo Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Acked-by: Andrew Morton Signed-off-by: Jens Axboe --- include/linux/bio.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index a279ba384da9..a00dfff51aa5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -553,6 +553,13 @@ do { \ #define bio_dev(bio) \ disk_devt((bio)->bi_disk) +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); +#else +static inline int bio_associate_blkcg_from_page(struct bio *bio, + struct page *page) { return 0; } +#endif + #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); -- cgit v1.2.3 From d09d8df3a29403693d9d20cc34ed101f2c558e2b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:55 -0400 Subject: blkcg: add generic throttling mechanism Since IO can be issued from literally anywhere it's almost impossible to do throttling without having some sort of adverse effect somewhere else in the system because of locking or other dependencies. The best way to solve this is to do the throttling when we know we aren't holding any other kernel resources. Do this by tracking throttling in a per-blkg basis, and if we require throttling flag the task that it needs to check before it returns to user space and possibly sleep there. This is to address the case where a process is doing work that is generating IO that can't be throttled, whether that is directly with a lot of REQ_META IO, or indirectly by allocating so much memory that it is swamping the disk with REQ_SWAP. We can't use task_add_work as we don't want to induce a memory allocation in the IO path, so simply saving the request queue in the task and flagging it to do the notify_resume thing achieves the same result without the overhead of a memory allocation. Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 99 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/cgroup-defs.h | 3 ++ include/linux/sched.h | 8 ++++ include/linux/tracehook.h | 2 + 4 files changed, 112 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index a8f9ba8f33a4..de57de4831d5 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -136,6 +136,12 @@ struct blkcg_gq { struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; + + atomic_t use_delay; + atomic64_t delay_nsec; + atomic64_t delay_start; + u64 last_delay; + int last_use; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); @@ -241,6 +247,26 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return css_to_blkcg(task_css(current, io_cgrp_id)); } +static inline bool blk_cgroup_congested(void) +{ + struct cgroup_subsys_state *css; + bool ret = false; + + rcu_read_lock(); + css = kthread_blkcg(); + if (!css) + css = task_css(current, io_cgrp_id); + while (css) { + if (atomic_read(&css->cgroup->congestion_count)) { + ret = true; + break; + } + css = css->parent; + } + rcu_read_unlock(); + return ret; +} + /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. @@ -374,6 +400,21 @@ static inline void blkg_get(struct blkcg_gq *blkg) atomic_inc(&blkg->refcnt); } +/** + * blkg_try_get - try and get a blkg reference + * @blkg: blkg to get + * + * This is for use when doing an RCU lookup of the blkg. We may be in the midst + * of freeing this blkg, so we can only use it if the refcnt is not zero. + */ +static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) +{ + if (atomic_inc_not_zero(&blkg->refcnt)) + return blkg; + return NULL; +} + + void __blkg_release_rcu(struct rcu_head *rcu); /** @@ -734,6 +775,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, return !throtl; } +static inline void blkcg_use_delay(struct blkcg_gq *blkg) +{ + if (atomic_add_return(1, &blkg->use_delay) == 1) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); +} + +static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + if (old == 0) + return 0; + + /* + * We do this song and dance because we can race with somebody else + * adding or removing delay. If we just did an atomic_dec we'd end up + * negative and we'd already be in trouble. We need to subtract 1 and + * then check to see if we were the last delay so we can drop the + * congestion count on the cgroup. + */ + while (old) { + int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); + if (cur == old) + break; + old = cur; + } + + if (old == 0) + return 0; + if (old == 1) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + return 1; +} + +static inline void blkcg_clear_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + if (!old) + return; + /* We only want 1 person clearing the congestion count for this blkg. */ + while (old) { + int cur = atomic_cmpxchg(&blkg->use_delay, old, 0); + if (cur == old) { + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + break; + } + old = cur; + } +} + +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_maybe_throttle_current(void); #else /* CONFIG_BLK_CGROUP */ struct blkcg { @@ -753,8 +847,13 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) +static inline void blkcg_maybe_throttle_current(void) { } +static inline bool blk_cgroup_congested(void) { return false; } + #ifdef CONFIG_BLOCK +static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } + static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c0e68f903011..ff20b677fb9f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -438,6 +438,9 @@ struct cgroup { /* used to store eBPF programs */ struct cgroup_bpf bpf; + /* If there is block congestion on this cgroup. */ + atomic_t congestion_count; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 43731fe51c97..c2e993de67ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -734,6 +734,10 @@ struct task_struct { /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; #endif +#ifdef CONFIG_BLK_CGROUP + /* to be used once the psi infrastructure lands upstream. */ + unsigned use_memdelay:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ @@ -1151,6 +1155,10 @@ struct task_struct { unsigned int memcg_nr_pages_over_high; #endif +#ifdef CONFIG_BLK_CGROUP + struct request_queue *throttle_queue; +#endif + #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 4a8841963c2e..05589a3e37f4 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -51,6 +51,7 @@ #include #include #include +#include struct linux_binprm; /* @@ -192,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) task_work_run(); mem_cgroup_handle_over_high(); + blkcg_maybe_throttle_current(); } #endif /* */ -- cgit v1.2.3 From 2cf855837b89d92996cf264713f3bed2bf9b0b4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Jul 2018 11:14:56 -0400 Subject: memcontrol: schedule throttling if we are congested Memory allocations can induce swapping via kswapd or direct reclaim. If we are having IO done for us by kswapd and don't actually go into direct reclaim we may never get scheduled for throttling. So instead check to see if our cgroup is congested, and if so schedule the throttling. Before we return to user space the throttling stuff will only throttle if we actually required it. Signed-off-by: Tejun Heo Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Acked-by: Andrew Morton Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 13 +++++++++++++ include/linux/swap.h | 11 ++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6c6fb116e925..680d3395fc83 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -317,6 +317,9 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp, bool compound); +int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare, bool compound); void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, @@ -789,6 +792,16 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, return 0; } +static inline int mem_cgroup_try_charge_delay(struct page *page, + struct mm_struct *mm, + gfp_t gfp_mask, + struct mem_cgroup **memcgp, + bool compound) +{ + *memcgp = NULL; + return 0; +} + static inline void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare, bool compound) diff --git a/include/linux/swap.h b/include/linux/swap.h index c063443d8638..1a8bd05a335e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -629,7 +629,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return memcg->swappiness; } - #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { @@ -637,6 +636,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) } #endif +#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, + gfp_t gfp_mask); +#else +static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, + int node, gfp_t gfp_mask) +{ +} +#endif + #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -- cgit v1.2.3 From a79050434b45959f397042080fd1d70ffa9bd9df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 09:32:35 -0600 Subject: blk-rq-qos: refactor out common elements of blk-wbt blkcg-qos is going to do essentially what wbt does, only on a cgroup basis. Break out the common code that will be shared between blkcg-qos and wbt into blk-rq-qos.* so they can both utilize the same infrastructure. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9d05646d5059..137759862f07 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -42,7 +42,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; -struct rq_wb; +struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; @@ -443,7 +443,7 @@ struct request_queue { int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ struct blk_queue_stats *stats; - struct rq_wb *rq_wb; + struct rq_qos *rq_qos; /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg -- cgit v1.2.3 From d70675121546c35feaceebf7ed9caed8716640f3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:15:01 -0400 Subject: block: introduce blk-iolatency io controller Current IO controllers for the block layer are less than ideal for our use case. The io.max controller is great at hard limiting, but it is not work conserving. This patch introduces io.latency. You provide a latency target for your group and we monitor the io in short windows to make sure we are not exceeding those latency targets. This makes use of the rq-qos infrastructure and works much like the wbt stuff. There are a few differences from wbt - It's bio based, so the latency covers the whole block layer in addition to the actual io. - We will throttle all IO types that comes in here if we need to. - We use the mean latency over the 100ms window. This is because writes can be particularly fast, which could give us a false sense of the impact of other workloads on our protected workload. - By default there's no throttling, we set the queue_depth to INT_MAX so that we can have as many outstanding bio's as we're allowed to. Only at throttle time do we pay attention to the actual queue depth. - We backcharge cgroups for root cg issued IO and induce artificial delays in order to deal with cases like metadata only or swap heavy workloads. In testing this has worked out relatively well. Protected workloads will throttle noisy workloads down to 1 io at time if they are doing normal IO on their own, or induce up to a 1 second delay per syscall if they are doing a lot of root issued IO (metadata/swap IO). Our testing has revolved mostly around our production web servers where we have hhvm (the web server application) in a protected group and everything else in another group. We see slightly higher requests per second (RPS) on the test tier vs the control tier, and much more stable RPS across all machines in the test tier vs the control tier. Another test we run is a slow memory allocator in the unprotected group. Before this would eventually push us into swap and cause the whole box to die and not recover at all. With these patches we see slight RPS drops (usually 10-15%) before the memory consumer is properly killed and things recover within seconds. Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0ffc34c5cc83..e13449a379a1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -180,9 +180,7 @@ struct bio { struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW struct bio_issue bi_issue; -#endif #endif union { #if defined(CONFIG_BLK_DEV_INTEGRITY) -- cgit v1.2.3 From 05814a10370b3252fe2b0898b6adac3cdd531096 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Fri, 13 Jul 2018 17:07:26 +0300 Subject: block: remove blkdev_entry_to_request() macro Remove blkdev_entry_to_request() macro, which remained unused through the observable history, also note that it repeats list_entry_rq() macro verbatim. Signed-off-by: Vladimir Zapolskiy Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 137759862f07..1939ed95f936 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1436,8 +1436,6 @@ enum blk_default_limits { BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) - static inline unsigned long queue_segment_boundary(struct request_queue *q) { return q->limits.seg_boundary_mask; -- cgit v1.2.3 From 3f289dcb4b265416a57ca79cf4a324060bb09060 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Jul 2018 04:47:36 -0700 Subject: block: make bdev_ops->rw_page() take a REQ_OP instead of bool c11f0c0b5bb9 ("block/mm: make bdev_ops->rw_page() take a bool for read/write") replaced @op with boolean @is_write, which limited the amount of information going into ->rw_page() and more importantly page_endio(), which removed the need to expose block internals to mm. Unfortunately, we want to track discards separately and @is_write isn't enough information. This patch updates bdev_ops->rw_page() to take REQ_OP instead but leaves page_endio() to take bool @is_write. This allows the block part of operations to have enough information while not leaking it to mm. Signed-off-by: Tejun Heo Cc: Mike Christie Cc: Minchan Kim Cc: Dan Williams Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1939ed95f936..331a6cb8805f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1943,7 +1943,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req, struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, bool); + int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, -- cgit v1.2.3 From 59767fbd49d794b4499d30b314df6c0d4aca584b Mon Sep 17 00:00:00 2001 From: Michael Callahan Date: Wed, 18 Jul 2018 04:47:37 -0700 Subject: block: Add part_stat_read_accum to read across field entries. Add a part_stat_read_accum macro to genhd.h to read and sum across field entries. For example to sum up the number read and write sectors completed. In addition to being ar reasonable cleanup by itself this will make it easier to add new stat fields in the future. tj: Refreshed on top of v4.17. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6cb8a5789668..19f36fa10995 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -353,6 +353,10 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ +#define part_stat_read_accum(part, field) \ + (part_stat_read(part, field[0]) + \ + part_stat_read(part, field[1])) + #define part_stat_add(cpu, part, field, addnd) do { \ __part_stat_add((cpu), (part), field, addnd); \ if ((part)->partno) \ -- cgit v1.2.3 From dbae2c551377b6533a00c11fc7ede370100ab404 Mon Sep 17 00:00:00 2001 From: Michael Callahan Date: Wed, 18 Jul 2018 04:47:38 -0700 Subject: block: Define and use STAT_READ and STAT_WRITE Add defines for STAT_READ and STAT_WRITE for indexing the partition stat entries. This clarifies some fs/ code which has hardcoded 1 for STAT_WRITE and will make it easier to extend the stats with additional fields. tj: Refreshed on top of v4.17. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Cc: "Theodore Ts'o" Cc: Jaegeuk Kim Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 7 +++++++ include/linux/genhd.h | 13 +++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e13449a379a1..d2b44de56bc1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -357,6 +357,13 @@ enum req_flag_bits { #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) +enum stat_group { + STAT_READ, + STAT_WRITE, + + NR_STAT_GROUPS +}; + #define bio_op(bio) \ ((bio)->bi_opf & REQ_OP_MASK) #define req_op(req) \ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 19f36fa10995..a75445446974 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK @@ -82,10 +83,10 @@ struct partition { } __attribute__((packed)); struct disk_stats { - unsigned long sectors[2]; /* READs and WRITEs */ - unsigned long ios[2]; - unsigned long merges[2]; - unsigned long ticks[2]; + unsigned long sectors[NR_STAT_GROUPS]; + unsigned long ios[NR_STAT_GROUPS]; + unsigned long merges[NR_STAT_GROUPS]; + unsigned long ticks[NR_STAT_GROUPS]; unsigned long io_ticks; unsigned long time_in_queue; }; @@ -354,8 +355,8 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ #define part_stat_read_accum(part, field) \ - (part_stat_read(part, field[0]) + \ - part_stat_read(part, field[1])) + (part_stat_read(part, field[STAT_READ]) + \ + part_stat_read(part, field[STAT_WRITE])) #define part_stat_add(cpu, part, field, addnd) do { \ __part_stat_add((cpu), (part), field, addnd); \ -- cgit v1.2.3 From ddcf35d397976421a4ec1d0d00fbcc027a8cb034 Mon Sep 17 00:00:00 2001 From: Michael Callahan Date: Wed, 18 Jul 2018 04:47:39 -0700 Subject: block: Add and use op_stat_group() for indexing disk_stat fields. Add and use a new op_stat_group() function for indexing partition stat fields rather than indexing them by rq_data_dir() or bio_data_dir(). This function works similarly to op_is_sync() in that it takes the request::cmd_flags or bio::bi_opf flags and determines which stats should et updated. In addition, the second parameter to generic_start_io_acct() and generic_end_io_acct() is now a REQ_OP rather than simply a read or write bit and it uses op_stat_group() on the parameter to determine the stat group. Note that the partition in_flight counts are not part of the per-cpu statistics and as such are not indexed via this function. It's now indexed by op_is_write(). tj: Refreshed on top of v4.17. Updated to pass around REQ_OP. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Cc: Minchan Kim Cc: Dan Williams Cc: Joshua Morris Cc: Philipp Reisner Cc: Matias Bjorling Cc: Kent Overstreet Cc: Alasdair Kergon Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 ++-- include/linux/blk_types.h | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index a00dfff51aa5..ab221c517f4e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -496,9 +496,9 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(struct request_queue *q, int rw, +void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part); -void generic_end_io_acct(struct request_queue *q, int rw, +void generic_end_io_acct(struct request_queue *q, int op, struct hd_struct *part, unsigned long start_time); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d2b44de56bc1..2960a96d833c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -401,6 +401,11 @@ static inline bool op_is_sync(unsigned int op) (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } +static inline int op_stat_group(unsigned int op) +{ + return op_is_write(op); +} + typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U #define BLK_QC_T_SHIFT 16 -- cgit v1.2.3 From bdca3c87fb7ad1cc61d231d37eb0d8f90d001e0c Mon Sep 17 00:00:00 2001 From: Michael Callahan Date: Wed, 18 Jul 2018 04:47:40 -0700 Subject: block: Track DISCARD statistics and output them in stat and diskstat Add tracking of REQ_OP_DISCARD ios to the partition statistics and append them to the various stat files in /sys as well as /proc/diskstats. These are tracked with the same four stats as reads and writes: Number of discard ios completed. Number of discard ios merged Number of discard sectors completed Milliseconds spent on discard requests This is done via adding a new STAT_DISCARD define to genhd.h and then using it to index that stat field for discard requests. tj: Refreshed on top of v4.17 and other previous updates. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Cc: Andy Newell Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 8 ++++++++ include/linux/genhd.h | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2960a96d833c..f6dfb30737d8 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -360,6 +360,7 @@ enum req_flag_bits { enum stat_group { STAT_READ, STAT_WRITE, + STAT_DISCARD, NR_STAT_GROUPS }; @@ -401,8 +402,15 @@ static inline bool op_is_sync(unsigned int op) (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } +static inline bool op_is_discard(unsigned int op) +{ + return (op & REQ_OP_MASK) == REQ_OP_DISCARD; +} + static inline int op_stat_group(unsigned int op) { + if (op_is_discard(op)) + return STAT_DISCARD; return op_is_write(op); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a75445446974..57864422a2c8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -356,7 +356,8 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ - part_stat_read(part, field[STAT_WRITE])) + part_stat_read(part, field[STAT_WRITE]) + \ + part_stat_read(part, field[STAT_DISCARD])) #define part_stat_add(cpu, part, field, addnd) do { \ __part_stat_add((cpu), (part), field, addnd); \ -- cgit v1.2.3 From 636620b66d5d4012c4a9c86206013964d3986c4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Jul 2018 04:47:41 -0700 Subject: blkcg: Track DISCARD statistics and output them in cgroup io.stat Add tracking of REQ_OP_DISCARD ios to the per-cgroup io.stat. Two fields, dbytes and dios, to respectively count the total bytes and number of discards are added. Signed-off-by: Tejun Heo Cc: Andy Newell Cc: Michael Callahan Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index de57de4831d5..3bed5e02a873 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -35,6 +35,7 @@ enum blkg_rwstat_type { BLKG_RWSTAT_WRITE, BLKG_RWSTAT_SYNC, BLKG_RWSTAT_ASYNC, + BLKG_RWSTAT_DISCARD, BLKG_RWSTAT_NR, BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, @@ -649,7 +650,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, { struct percpu_counter *cnt; - if (op_is_write(op)) + if (op_is_discard(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; + else if (op_is_write(op)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; -- cgit v1.2.3 From 40c6f9c28ef03f2f2c3ee58c2447a6e6b9a713f2 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Fri, 15 Jun 2018 12:39:27 -0600 Subject: nvme.h: resync with nvme-cli Added some feature ids present in nvme-cli but not kernel. Signed-off-by: Revanth Rajashekar Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2950ce957656..80dfedcf0bf7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -749,6 +749,11 @@ enum { NVME_FEAT_HOST_MEM_BUF = 0x0d, NVME_FEAT_TIMESTAMP = 0x0e, NVME_FEAT_KATO = 0x0f, + NVME_FEAT_HCTM = 0x10, + NVME_FEAT_NOPSC = 0x11, + NVME_FEAT_RRL = 0x12, + NVME_FEAT_PLM_CONFIG = 0x13, + NVME_FEAT_PLM_WINDOW = 0x14, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, -- cgit v1.2.3 From 071f52fbce6161706d070ceada5accb81630bf02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Jul 2018 09:52:32 +0200 Subject: block: remove bio_clone_kmalloc Unused now. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ab221c517f4e..b861baa59454 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -443,12 +443,6 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); } -static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) -{ - return bio_clone_bioset(bio, gfp_mask, NULL); - -} - extern blk_qc_t submit_bio(struct bio *); extern void bio_endio(struct bio *); -- cgit v1.2.3 From c55183c9aaa00d2bbb578169a480e31aff3d397c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Jul 2018 09:52:34 +0200 Subject: block: unexport bio_clone_bioset Now only used by the bounce code, so move it there and mark the function static. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index b861baa59454..51371740d2a8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -429,7 +429,6 @@ extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); -extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); extern struct bio_set fs_bio_set; -- cgit v1.2.3 From 359f642700f2ff05d9c94cd9216c97af7b8e9553 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Wed, 25 Jul 2018 10:22:58 -0400 Subject: block: move bio_integrity_{intervals,bytes} into blkdev.h This allows bio_integrity_bytes() to be called from drivers instead of open coding it. Acked-by: Martin K. Petersen Signed-off-by: Greg Edwards Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 331a6cb8805f..050d599f5ea9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1865,6 +1865,28 @@ static inline bool integrity_req_gap_front_merge(struct request *req, bip_next->bip_vec[0].bv_offset); } +/** + * bio_integrity_intervals - Return number of integrity intervals for a bio + * @bi: blk_integrity profile for device + * @sectors: Size of the bio in 512-byte sectors + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the data integrity + * interval size of the storage device. Convert the block layer sectors + * to the appropriate number of integrity intervals. + */ +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return sectors >> (bi->interval_exp - 9); +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_intervals(bi, sectors) * bi->tuple_size; +} + #else /* CONFIG_BLK_DEV_INTEGRITY */ struct bio; @@ -1938,6 +1960,18 @@ static inline bool integrity_req_gap_front_merge(struct request *req, return false; } +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ struct block_device_operations { -- cgit v1.2.3 From 9b89bc3857a6c0dfda18ddae2a42c114ecc32753 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 12 May 2018 18:18:12 +0200 Subject: nvme.h: add support for the log specific field NVMe 1.3 added a new log specific field to the get log page CQ defintion, add it to our get_log_page SQ structure. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 80dfedcf0bf7..39f05b0b8c7f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -885,7 +885,7 @@ struct nvme_get_log_page_command { __u64 rsvd2[2]; union nvme_data_ptr dptr; __u8 lid; - __u8 rsvd10; + __u8 lsp; /* upper 4 bits reserved */ __le16 numdl; __le16 numdu; __u16 rsvd11; -- cgit v1.2.3 From 1a37621658fe06b10cf8bac02c32304d2a1c888c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 May 2018 18:53:57 +0200 Subject: nvme.h: add ANA definitions Add various defintions from NVMe 1.3 TP 4004. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- include/linux/nvme.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 39f05b0b8c7f..64c9175723de 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -242,7 +242,12 @@ struct nvme_id_ctrl { __le32 sanicap; __le32 hmminds; __le16 hmmaxd; - __u8 rsvd338[174]; + __u8 rsvd338[4]; + __u8 anatt; + __u8 anacap; + __le32 anagrpmax; + __le32 nanagrpid; + __u8 rsvd352[160]; __u8 sqes; __u8 cqes; __le16 maxcmd; @@ -258,7 +263,8 @@ struct nvme_id_ctrl { __le16 acwu; __u8 rsvd534[2]; __le32 sgls; - __u8 rsvd540[228]; + __le32 mnan; + __u8 rsvd544[224]; char subnqn[256]; __u8 rsvd1024[768]; __le32 ioccsz; @@ -312,7 +318,9 @@ struct nvme_id_ns { __le16 nabspf; __le16 noiob; __u8 nvmcap[16]; - __u8 rsvd64[40]; + __u8 rsvd64[28]; + __le32 anagrpid; + __u8 rsvd96[8]; __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; @@ -425,6 +433,32 @@ struct nvme_effects_log { __u8 resv[2048]; }; +enum nvme_ana_state { + NVME_ANA_OPTIMIZED = 0x01, + NVME_ANA_NONOPTIMIZED = 0x02, + NVME_ANA_INACCESSIBLE = 0x03, + NVME_ANA_PERSISTENT_LOSS = 0x04, + NVME_ANA_CHANGE = 0x0f, +}; + +struct nvme_ana_group_desc { + __le32 grpid; + __le32 nnsids; + __le64 chgcnt; + __u8 state; + __u8 rsvd17[7]; + __le32 nsids[]; +}; + +/* flag for the log specific field of the ANA log */ +#define NVME_ANA_LOG_RGO (1 << 0) + +struct nvme_ana_rsp_hdr { + __le64 chgcnt; + __le16 ngrps; + __le16 rsvd10[3]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -444,11 +478,13 @@ enum { enum { NVME_AER_NOTICE_NS_CHANGED = 0x00, NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, + NVME_AER_NOTICE_ANA = 0x03, }; enum { NVME_AEN_CFG_NS_ATTR = 1 << 8, NVME_AEN_CFG_FW_ACT = 1 << 9, + NVME_AEN_CFG_ANA_CHANGE = 1 << 11, }; struct nvme_lba_range_type { @@ -763,6 +799,7 @@ enum { NVME_LOG_FW_SLOT = 0x03, NVME_LOG_CHANGED_NS = 0x04, NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_ANA = 0x0c, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -1185,6 +1222,13 @@ enum { NVME_SC_ACCESS_DENIED = 0x286, NVME_SC_UNWRITTEN_BLOCK = 0x287, + /* + * Path-related Errors: + */ + NVME_SC_ANA_PERSISTENT_LOSS = 0x301, + NVME_SC_ANA_INACCESSIBLE = 0x302, + NVME_SC_ANA_TRANSITION = 0x303, + NVME_SC_DNR = 0x4000, }; -- cgit v1.2.3 From c454edc21b12dd7d416de6c81555e87aaec9685c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Jul 2018 10:10:01 -0400 Subject: block: don't account for split bio's size in cgroup stats We need to check in blkcg_bio_issue_check if the bio is flagged as QUEUE_ENTERED, because if it is then we've already accounted for the size of the IO in the cgroup stats. We can still however account for the extra IO since it'll be another request. Reported-by: Tejun Heo Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3bed5e02a873..f7b910768306 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -769,8 +769,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, if (!throtl) { blkg = blkg ?: q->root_blkg; - blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); + /* + * If the bio is flagged with BIO_QUEUE_ENTERED it means this + * is a split bio and we would have already accounted for the + * size of the bio. + */ + if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } -- cgit v1.2.3 From ddd0bc756983dc4d19000a4fe021b4c7f9d59aab Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 30 Jul 2018 00:15:31 +0300 Subject: block: move ref_tag calculation func to the block layer Currently this function is implemented in the scsi layer, but it's actual place should be the block layer since T10-PI is a general data integrity feature that is used in the nvme protocol as well. Suggested-by: Christoph Hellwig Cc: Martin K. Petersen Signed-off-by: Max Gurtovoy Signed-off-by: Jens Axboe --- include/linux/t10-pi.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index c6aa8a3c42ed..c40511f4e63d 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -37,6 +37,16 @@ struct t10_pi_tuple { #define T10_PI_APP_ESCAPE cpu_to_be16(0xffff) #define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff) +static inline u32 t10_pi_ref_tag(struct request *rq) +{ +#ifdef CONFIG_BLK_DEV_INTEGRITY + return blk_rq_pos(rq) >> + (rq->q->integrity.interval_exp - 9) & 0xffffffff; +#else + return -1U; +#endif +} + extern const struct blk_integrity_profile t10_pi_type1_crc; extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; -- cgit v1.2.3 From 10c41ddd61323b27b447bc8e18296ac6c06107ad Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 30 Jul 2018 00:15:32 +0300 Subject: block: move dif_prepare/dif_complete functions to block layer Currently these functions are implemented in the scsi layer, but their actual place should be the block layer since T10-PI is a general data integrity feature that is used in the nvme protocol as well. Also, use the tuple size from the integrity profile since it may vary between integrity types. Suggested-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Max Gurtovoy Signed-off-by: Jens Axboe --- include/linux/t10-pi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index c40511f4e63d..5a427c289f58 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -51,5 +51,8 @@ extern const struct blk_integrity_profile t10_pi_type1_crc; extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; extern const struct blk_integrity_profile t10_pi_type3_ip; +extern void t10_pi_prepare(struct request *rq, u8 protection_type); +extern void t10_pi_complete(struct request *rq, u8 protection_type, + unsigned int intervals); #endif -- cgit v1.2.3 From 08fcf813281ebcf72c69487c1501ad91b7121cdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 31 Jul 2018 09:10:26 -0600 Subject: t10-pi: provide empty t10_pi_complete() for !CONFIG_BLK_DEV_INTEGRITY Fixes a link failure whtn BLK_DEV_INTEGRITY isn't defined. Fixes: 10c41ddd6132 ("block: move dif_prepare/dif_complete functions to block layer") Signed-off-by: Jens Axboe --- include/linux/t10-pi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 5a427c289f58..b9626aa7e90c 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -51,8 +51,19 @@ extern const struct blk_integrity_profile t10_pi_type1_crc; extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; extern const struct blk_integrity_profile t10_pi_type3_ip; + +#ifdef CONFIG_BLK_DEV_INTEGRITY extern void t10_pi_prepare(struct request *rq, u8 protection_type); extern void t10_pi_complete(struct request *rq, u8 protection_type, unsigned int intervals); +#else +static inline void t10_pi_complete(struct request *rq, u8 protection_type, + unsigned int intervals) +{ +} +static inline void t10_pi_prepare(struct request *rq, u8 protection_type) +{ +} +#endif #endif -- cgit v1.2.3 From e7d0748dd71695b94f3a35c8bdc05226a7f3d919 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Aug 2018 15:22:13 -0600 Subject: block: Switch struct packet_command to use struct scsi_sense_hdr There is a lot of needless struct request_sense usage in the CDROM code. These can all be struct scsi_sense_hdr instead, to avoid any confusion over their respective structure sizes. This patch is a lot of noise changing "sense" to "sshdr", but the final code is more readable to distinguish between "sense" meaning "struct request_sense" and "sshdr" meaning "struct scsi_sense_hdr". Reviewed-by: Christoph Hellwig Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- include/linux/cdrom.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index e75dfd1f1dec..528271c60018 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -13,6 +13,7 @@ #include /* not really needed, later.. */ #include +#include #include struct packet_command @@ -21,7 +22,7 @@ struct packet_command unsigned char *buffer; unsigned int buflen; int stat; - struct request_sense *sense; + struct scsi_sense_hdr *sshdr; unsigned char data_direction; int quiet; int timeout; -- cgit v1.2.3 From 8b92d0e3d400390660a26ef7f475524700fb86cf Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 8 Aug 2018 08:35:29 +0200 Subject: nvme.h: fixup ANA group descriptor format ANA Phase 3 draft had the 'reserved' field in the group descriptor format set to '23:17' (so that the first namespace identifier started at byte 24), but that got move with the approved TP to '31:17' (so that the first namespace identifier started at byte 32). Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 64c9175723de..a661861e9d56 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -446,7 +446,7 @@ struct nvme_ana_group_desc { __le32 nnsids; __le64 chgcnt; __u8 state; - __u8 rsvd17[7]; + __u8 rsvd17[15]; __le32 nsids[]; }; -- cgit v1.2.3 From 93045d5942da60801e71764597d448cf37a798c1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 7 Aug 2018 23:01:05 -0700 Subject: nvme.h: add support for ns write protect definitions Add various definitions from NVMe 1.3 TP 4005. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a661861e9d56..68e91ef5494c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -259,7 +259,7 @@ struct nvme_id_ctrl { __le16 awun; __le16 awupf; __u8 nvscc; - __u8 rsvd531; + __u8 nwpc; __le16 acwu; __u8 rsvd534[2]; __le32 sgls; @@ -320,7 +320,9 @@ struct nvme_id_ns { __u8 nvmcap[16]; __u8 rsvd64[28]; __le32 anagrpid; - __u8 rsvd96[8]; + __u8 rsvd96[3]; + __u8 nsattr; + __u8 rsvd100[4]; __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; @@ -794,6 +796,7 @@ enum { NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, NVME_FEAT_RESV_PERSIST = 0x83, + NVME_FEAT_WRITE_PROTECT = 0x84, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, @@ -807,6 +810,14 @@ enum { NVME_FWACT_ACTV = (2 << 3), }; +/* NVMe Namespace Write Protect State */ +enum { + NVME_NS_NO_WRITE_PROTECT = 0, + NVME_NS_WRITE_PROTECT, + NVME_NS_WRITE_PROTECT_POWER_CYCLE, + NVME_NS_WRITE_PROTECT_PERMANENT, +}; + #define NVME_MAX_CHANGED_NAMESPACES 1024 struct nvme_identify { @@ -1153,6 +1164,8 @@ enum { NVME_SC_SGL_INVALID_OFFSET = 0x16, NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + NVME_SC_NS_WRITE_PROTECTED = 0x20, + NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, -- cgit v1.2.3 From b1f4267cc5448d20ae0c515a74141e74365e78a3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Aug 2018 07:47:28 -0700 Subject: block: Remove two superfluous #include directives Commit 12f5b9314545 ("blk-mq: Remove generation seqeunce") removed the only seqcount_t and u64_stats_sync instances from but did not remove the corresponding #include directives. Since these include directives are no longer needed, remove them. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Keith Busch Cc: Ming Lei Cc: Jianchao Wang Cc: Hannes Reinecke , Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 050d599f5ea9..d6869e0e2b64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,8 +27,6 @@ #include #include #include -#include -#include struct module; struct scsi_ioctl_command; -- cgit v1.2.3 From 6bad9b210a228d2fe0e0efe26d9b115348529cee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Aug 2018 07:53:36 -0700 Subject: blkcg: Introduce blkg_root_lookup() This new function will be used in a later patch to verify whether a queue has been dissociated from the cgroup controller before being released. Signed-off-by: Bart Van Assche Cc: Tejun Heo Cc: Christoph Hellwig Cc: Ming Lei Cc: Omar Sandoval Cc: Johannes Thumshirn Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Joseph Qi Cc: Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index f7b910768306..1361cfc9b878 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -341,6 +341,23 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, return __blkg_lookup(blkcg, q, false); } +/** + * blkg_lookup - look up blkg for the specified request queue + * @q: request_queue of interest + * + * Lookup blkg for @q at the root level. See also blkg_lookup(). + */ +static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) +{ + struct blkcg_gq *blkg; + + rcu_read_lock(); + blkg = blkg_lookup(&blkcg_root, q); + rcu_read_unlock(); + + return blkg; +} + /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest @@ -864,6 +881,7 @@ static inline bool blk_cgroup_congested(void) { return false; } static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) { return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } static inline void blkcg_exit_queue(struct request_queue *q) { } -- cgit v1.2.3 From b86d865cb1cae1e61527ea0b8977078bbf694328 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 10 Aug 2018 13:28:07 -0700 Subject: blkcg: Make blkg_root_lookup() work for queues in bypass mode For legacy queues the only call of blkg_root_lookup() happens after bypass mode has been enabled. Since blkg_lookup() returns NULL for queues in bypass mode, modify the blkg_root_lookup() such that it no longer depends on bypass mode. Rename the function into blk_queue_root_blkg() as suggested by Tejun. Suggested-by: Tejun Heo Fixes: 6bad9b210a22 ("blkcg: Introduce blkg_root_lookup()") Signed-off-by: Bart Van Assche Cc: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1361cfc9b878..34aec30e06c7 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -342,20 +342,14 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, } /** - * blkg_lookup - look up blkg for the specified request queue + * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair * @q: request_queue of interest * * Lookup blkg for @q at the root level. See also blkg_lookup(). */ -static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) { - struct blkcg_gq *blkg; - - rcu_read_lock(); - blkg = blkg_lookup(&blkcg_root, q); - rcu_read_unlock(); - - return blkg; + return q->root_blkg; } /** @@ -881,7 +875,8 @@ static inline bool blk_cgroup_congested(void) { return false; } static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) { return NULL; } +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } static inline void blkcg_exit_queue(struct request_queue *q) { } -- cgit v1.2.3