From 1ed8d48c57bf7400eac7b8dc622ab0413715cafb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:52 -0700 Subject: writeback: bdi_for_each_wb() iteration is memcg ID based not blkcg wb's (bdi_writeback's) are currently keyed by memcg ID; however, in an earlier implementation, wb's were keyed by blkcg ID. bdi_for_each_wb() walks bdi->cgwb_tree in the ascending ID order and allows iterations to start from an arbitrary ID which is used to interrupt and resume iterations. Unfortunately, while changing wb to be keyed by memcg ID instead of blkcg, bdi_for_each_wb() was missed and is still assuming that wb's are keyed by blkcg ID. This doesn't affect iterations which don't get interrupted but bdi_split_work_to_wbs() makes use of iteration resuming on allocation failures and thus may incorrectly skip or repeat wb's. Fix it by changing bdi_for_each_wb() to take memcg IDs instead of blkcg IDs and updating bdi_split_work_to_wbs() accordingly. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0fe9df983ab7..23ebb946e66f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) } struct wb_iter { - int start_blkcg_id; + int start_memcg_id; struct radix_tree_iter tree_iter; void **slot; }; @@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, WARN_ON_ONCE(!rcu_read_lock_held()); - if (iter->start_blkcg_id >= 0) { - iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); - iter->start_blkcg_id = -1; + if (iter->start_memcg_id >= 0) { + iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id); + iter->start_memcg_id = -1; } else { iter->slot = radix_tree_next_slot(iter->slot, titer, 
0); } @@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, struct backing_dev_info *bdi, - int start_blkcg_id) + int start_memcg_id) { - iter->start_blkcg_id = start_blkcg_id; + iter->start_memcg_id = start_memcg_id; - if (start_blkcg_id) + if (start_memcg_id) return __wb_iter_next(iter, bdi); else return &bdi->wb; } /** - * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order + * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order * @wb_cur: cursor struct bdi_writeback pointer * @bdi: bdi to walk wb's of * @iter: pointer to struct wb_iter to be used as iteration buffer - * @start_blkcg_id: blkcg ID to start iteration from + * @start_memcg_id: memcg ID to start iteration from * * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending - * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter + * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter * to be used as temp storage during iteration. rcu_read_lock() must be * held throughout iteration. */ -#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ - for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ +#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \ + for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \ (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) #else /* CONFIG_CGROUP_WRITEBACK */ -- cgit v1.2.3 From 9acee9c551f045d2c5b5261aa587331423fd7d92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:55 -0700 Subject: kernfs: implement kernfs_path_len() Add a function to determine the path length of a kernfs node. This for now will be used by writeback tracepoint updates. 
Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- include/linux/kernfs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 123be25ea15a..5d4e9c4b821d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); +size_t kernfs_path_len(struct kernfs_node *kn); char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); @@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } +static inline size_t kernfs_path_len(struct kernfs_node *kn) +{ return 0; } + static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return NULL; } -- cgit v1.2.3 From 401efbf835040dd2ebca54f78d58fc8e3c51f91d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:06 -0700 Subject: blkcg: remove unnecessary request_list->blkg NULL test in blk_put_rl() Since ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root blkcg"), a request_list always has its blkg associated. Drop unnecessary rl->blkg NULL test from blk_put_rl(). 
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1b62d768c7df..9711fc277c02 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -394,8 +394,7 @@ root_rl: */ static inline void blk_put_rl(struct request_list *rl) { - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + if (rl->blkg->blkcg != &blkcg_root) blkg_put(rl->blkg); } -- cgit v1.2.3 From 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:09 -0700 Subject: blkcg: restructure blkg_policy_data allocation in blkcg_activate_policy() When a policy gets activated, it needs to allocate and install its policy data on all existing blkg's (blkcg_gq's). Because blkg iteration is protected by a spinlock, it currently counts the total number of blkg's in the system, allocates the matching number of policy data on a list and installs them during a single iteration. This can be simplified by using speculative GFP_NOWAIT allocations while iterating and falling back to a preallocated policy data on failure. If the preallocated one has already been consumed, it releases the lock, preallocate with GFP_KERNEL and then restarts the iteration. This can be a bit more expensive than before but policy activation is a very cold path and shouldn't matter. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9711fc277c02..db822880242a 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -80,9 +80,6 @@ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; - - /* used during policy activation */ - struct list_head alloc_node; }; /* -- cgit v1.2.3 From 001bea73e70efdf48a9e00188cf302f6b6aed2bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:11 -0700 Subject: blkcg: replace blkcg_policy->pd_size with ->pd_alloc/free_fn() methods A blkg (blkcg_gq) represents the relationship between a cgroup and request_queue. Each active policy has a pd (blkg_policy_data) on each blkg. The pd's were allocated by blkcg core and each policy could request to allocate extra space at the end by setting blkcg_policy->pd_size larger than the size of pd. This is a bit unusual but was done this way mostly to simplify error handling and all the existing use cases could be handled this way; however, this is becoming too restrictive now that percpu memory can be allocated without blocking. This introduces two new mandatory blkcg_policy methods - pd_alloc_fn() and pd_free_fn() - which are used to allocate and release pd for a given policy. As pd allocation is now done from policy side, it can simply allocate a larger area which embeds pd at the beginning. This change makes ->pd_size pointless. Removed. 
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index db822880242a..bd173ea360ce 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -68,13 +68,11 @@ struct blkg_rwstat { * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. 
*/ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ @@ -126,16 +124,16 @@ struct blkcg_gq { }; typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); struct blkcg_policy { int plid; - /* policy specific private data size */ - size_t pd_size; /* policy specific per-blkcg data size */ size_t cpd_size; /* cgroup files for the policy */ @@ -143,10 +141,12 @@ struct blkcg_policy { /* operations */ blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; -- cgit v1.2.3 From b2ce2643cc705aa9043642d7b6248ccfd8e20629 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:13 -0700 Subject: blk-throttle: clean up blkg_policy_data alloc/init/exit/free methods With the recent addition of alloc and free methods, things became messier. This patch reorganizes them according to the followings. * ->pd_alloc_fn() Responsible for allocation and static initializations - the ones which can be done independent of where the pd might be attached. * ->pd_init_fn() Initializations which require the knowledge of where the pd is attached. * ->pd_free_fn() The counter part of pd_alloc_fn(). Static de-init and freeing. This leaves ->pd_exit_fn() without any users. Removed. 
While at it, collapse a one-liner function throtl_pd_exit(), which has only one user, into its user. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bd173ea360ce..9879469b1b38 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -128,7 +128,6 @@ typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); @@ -145,7 +144,6 @@ struct blkcg_policy { blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; -- cgit v1.2.3 From a9520cd6f2ac1fbbf206b915946534c6dddbaae2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:14 -0700 Subject: blkcg: make blkcg_policy methods take a pointer to blkcg_policy_data The newly added ->pd_alloc_fn() and ->pd_free_fn() deal with pd (blkg_policy_data) while the older ones use blkg (blkcg_gq). As using blkg doesn't make sense for ->pd_alloc_fn() and after allocation pd can always be mapped to blkg and given that these are policy-specific methods, it makes sense to converge on pd. This patch makes all methods deal with pd instead of blkg. Most conversions are trivial. In blk-cgroup.c, a couple method invocation sites now test whether pd exists instead of policy state for consistency. This shouldn't cause any behavioral differences. 
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9879469b1b38..ddd4b8b252c7 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -125,11 +125,11 @@ struct blkcg_gq { typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; -- cgit v1.2.3 From 814376483e7d85b69a70634633f1f9d01c6ee0cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:15 -0700 Subject: blkcg: minor updates around blkcg_policy_data * Rename blkcg->pd[] to blkcg->cpd[] so that cpd is consistently used for blkcg_policy_data. * Make blkcg_policy->cpd_init_fn() take blkcg_policy_data instead of blkcg. This makes it consistent with blkg_policy_data methods and to-be-added cpd alloc/free methods. * blkcg_policy_data->blkcg and cpd_to_blkcg() added so that cpd_init_fn() can determine the associated blkcg from blkcg_policy_data. v2: blkcg_policy_data->blkcg initializations were missing. Added. 
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index ddd4b8b252c7..7988d4749fff 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -45,7 +45,7 @@ struct blkcg { struct blkcg_gq *blkg_hint; struct hlist_head blkg_list; - struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK @@ -88,7 +88,8 @@ struct blkg_policy_data { * each policy handle per-blkcg data. */ struct blkcg_policy_data { - /* the policy id this per-policy data belongs to */ + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; int plid; }; @@ -123,7 +124,7 @@ struct blkcg_gq { struct rcu_head rcu_head; }; -typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -243,7 +244,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { - return blkcg ? blkcg->pd[pol->plid] : NULL; + return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** @@ -257,6 +258,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) return pd ? pd->blkg : NULL; } +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? 
cpd->blkcg : NULL; +} + /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest -- cgit v1.2.3 From e4a9bde9589fdc51283755cdd75d47b27ca7c6fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:16 -0700 Subject: blkcg: replace blkcg_policy->cpd_size with ->cpd_alloc/free_fn() methods Each active policy has a cpd (blkcg_policy_data) on each blkcg. The cpd's were allocated by blkcg core and each policy could request to allocate extra space at the end by setting blkcg_policy->cpd_size larger than the size of cpd. This is a bit unusual but blkg (blkcg_gq) policy data used to be handled this way too so it made sense to be consistent; however, blkg policy data switched to alloc/free callbacks. This patch makes similar changes to cpd handling. blkcg_policy->cpd_alloc/free_fn() are added to replace ->cpd_size. As cpd allocation is now done from policy side, it can simply allocate a larger area which embeds cpd at the beginning. As ->cpd_alloc_fn() may be able to perform all necessary initializations, this patch makes ->cpd_init_fn() optional. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7988d4749fff..15f2382bc723 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -81,11 +81,11 @@ struct blkg_policy_data { }; /* - * Policies that need to keep per-blkcg data which is independent - * from any request_queue associated to it must specify its size - * with the cpd_size field of the blkcg_policy structure and - * embed a blkcg_policy_data in it. cpd_init() is invoked to let - * each policy handle per-blkcg data. + * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. 
A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. + * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ @@ -124,7 +124,9 @@ struct blkcg_gq { struct rcu_head rcu_head; }; +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -134,13 +136,14 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; - /* policy specific per-blkcg data size */ - size_t cpd_size; /* cgroup files for the policy */ struct cftype *cftypes; /* operations */ + blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; -- cgit v1.2.3 From 24f290466f79a6497f1654f64b9a841872cba3ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:17 -0700 Subject: blkcg: inline [__]blkg_lookup() blkg_lookup() checks whether the target queue is bypassing and, if not, calls __blkg_lookup() which first checks the lookup hint and then performs radix tree walk. The operations upto hint checking are trivial and there are many users of this function. This patch inlines blkg_lookup() and the fast path part of __blkg_lookup(). The radix tree lookup and hint update are now in blkg_lookup_slowpath(). This will help consolidating blkg handling by easing moving root blkcg short-circuit to inlined lookup fast path. 
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 49 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 15f2382bc723..d5b54aa50582 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -155,7 +155,8 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -231,6 +232,49 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) return css_to_blkcg(blkcg->css.parent); } +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. + */ +static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; + + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + return blkg_lookup_slowpath(blkcg, q, update_hint); +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. 
This function should be called + * under RCU read lock and is guaranteed to return %NULL if @q is bypassing + * - see blk_queue_bypass_start() for details. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(blk_queue_bypass(q))) + return NULL; + return __blkg_lookup(blkcg, q, false); +} + /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest @@ -313,9 +357,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) call_rcu(&blkg->rcu_head, __blkg_release_rcu); } -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant -- cgit v1.2.3 From 85b6bc9db6d5ab6980b43c38b5cbd11d24414ce4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:18 -0700 Subject: blkcg: move root blkg lookup optimization from throtl_lookup_tg() to __blkg_lookup() Currently, both throttle and cfq policies implement their own root blkg (blkcg_gq) lookup fast path. This patch moves root blkg optimization from throtl_lookup_tg() to __blkg_lookup(). cfq-iosched currently doesn't use blkg_lookup() but will be converted and drop the optimization too. 
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index d5b54aa50582..0609bce69f68 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -249,6 +249,9 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, { struct blkcg_gq *blkg; + if (blkcg == &blkcg_root) + return q->root_blkg; + blkg = rcu_dereference(blkcg->blkg_hint); if (blkg && blkg->q == q) return blkg; -- cgit v1.2.3 From ae11889636111199dbcf47283b4167f578b69472 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:20 -0700 Subject: blkcg: consolidate blkg creation in blkcg_bio_issue_check() blkg (blkcg_gq) currently is created by blkcg policies invoking blkg_lookup_create() which ends up repeating about the same code in different policies. Theoretically, this can avoid the overhead of looking and/or creating blkg's if blkcg is enabled but no policy is in use; however, the cost of blkg lookup / creation is very low especially if only the root blkcg is in use which is highly likely if no blkcg policy is in active use - it boils down to a single very predictable conditional and surrounding RCU protection. This patch consolidates blkg creation to a new function blkcg_bio_issue_check() which is called during bio issue from generic_make_request_checks(). blkcg_bio_issue_check() is now the only function which tries to create missing blkg's. The subsequent policy and request_list operations just perform blkg_lookup() and if missing falls back to the root. * blk_get_rl() no longer tries to create blkg. It uses blkg_lookup() instead of blkg_lookup_create(). * blk_throtl_bio() is now called from blkcg_bio_issue_check() with rcu read locked and blkg already looked up. Both throtl_lookup_tg() and throtl_lookup_create_tg() are dropped. * cfq is similarly updated. 
cfq_lookup_create_cfqg() is replaced with cfq_lookup_cfqg() which uses blkg_lookup(). This consolidates blkg handling and avoids unnecessary blkg creation retries under memory pressure. In addition, this provides a common bio entry point into blkcg where things like common accounting can be performed. v2: Build fixes for !CONFIG_CFQ_GROUP_IOSCHED and !CONFIG_BLK_DEV_THROTTLING. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0609bce69f68..4d1659c7f84b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -421,8 +421,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, * or if either the blkcg or queue is going away. Fall back to * root_rl in such cases. */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) goto root_rl; blkg_get(blkg); @@ -636,6 +636,39 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to, u64_stats_update_end(&to->syncp); } +#ifdef CONFIG_BLK_DEV_THROTTLING +extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio); +#else +static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { return false; } +#endif + +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + bool throtl = false; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + spin_unlock_irq(q->queue_lock); + } + + throtl = blk_throtl_bio(q, blkg, bio); + + rcu_read_unlock(); + 
return !throtl; +} + #else /* CONFIG_BLK_CGROUP */ struct blkcg { @@ -689,6 +722,9 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) { return true; } + #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) -- cgit v1.2.3 From e6269c44546755094979ab53609e6e203a68c8ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:21 -0700 Subject: blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it cgroup stats are local to each cgroup and doesn't propagate to ancestors by default. When recursive stats are necessary, the sum is calculated over all the descendants. This initially was for backward compatibility to support both group-local and recursive stats but this mode of operation makes general sense as stat update is much hotter than reporting those stats. This however ends up losing recursive stats when a child is removed. To work around this, cfq-iosched adds its stats to its parent cfq_group->dead_stats which is summed up together when calculating recursive stats. It's planned that the core stats will be moved to blkcg_gq, so we want to move the mechanism for keeping track of the stats of dead children from cfq to blkcg core. This patch adds blkg_[rw]stat->aux_cnt which are atomic64_t's keeping track of auxiliary counts which are excluded when reading local counts but included for recursive. blkg_[rw]stat_merge() which were used by cfq to implement dead_stats are replaced by blkg_[rw]stat_add_aux(), and cfq now forwards stats of a dead cgroup to the aux counts of parent->stats instead of separate ->dead_stats. This will also help making blkg_[rw]stats per-cpu. 
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4d1659c7f84b..e8092276af58 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -53,14 +53,20 @@ struct blkcg { #endif }; +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children. + */ struct blkg_stat { struct u64_stats_sync syncp; uint64_t cnt; + atomic64_t aux_cnt; }; struct blkg_rwstat { struct u64_stats_sync syncp; uint64_t cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; /* @@ -483,6 +489,7 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, static inline void blkg_stat_init(struct blkg_stat *stat) { u64_stats_init(&stat->syncp); + atomic64_set(&stat->aux_cnt, 0); } /** @@ -504,8 +511,9 @@ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. + * Read the current value of @stat. The returned value doesn't include the + * aux count. This function can be called without synchroniztion and takes + * care of u64 atomicity. */ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { @@ -527,23 +535,31 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) static inline void blkg_stat_reset(struct blkg_stat *stat) { stat->cnt = 0; + atomic64_set(&stat->aux_cnt, 0); } /** - * blkg_stat_merge - merge a blkg_stat into another + * blkg_stat_add_aux - add a blkg_stat into another's aux count * @to: the destination blkg_stat * @from: the source * - * Add @from's count to @to. 
+ * Add @from's count including the aux one to @to's aux count. */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +static inline void blkg_stat_add_aux(struct blkg_stat *to, + struct blkg_stat *from) { - blkg_stat_add(to, blkg_stat_read(from)); + atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt), + &to->aux_cnt); } static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) { + int i; + u64_stats_init(&rwstat->syncp); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&rwstat->aux_cnt[i], 0); } /** @@ -614,26 +630,30 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) */ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { + int i; + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&rwstat->aux_cnt[i], 0); } /** - * blkg_rwstat_merge - merge a blkg_rwstat into another + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count * @to: the destination blkg_rwstat * @from: the source * - * Add @from's counts to @to. + * Add @from's count including the aux one to @to's aux count. */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) { struct blkg_rwstat v = blkg_rwstat_read(from); int i; - u64_stats_update_begin(&to->syncp); for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); + atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); } #ifdef CONFIG_BLK_DEV_THROTTLING -- cgit v1.2.3 From 24bdb8ef068ebdc2a57ce715f0ab22d5da32832a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:22 -0700 Subject: blkcg: make blkcg_[rw]stat per-cpu blkcg_[rw]stat are used as stat counters for blkcg policies. It isn't per-cpu by itself and blk-throttle makes it per-cpu by wrapping around it. 
This patch makes blkcg_[rw]stat per-cpu and drops the ad-hoc per-cpu wrapping in blk-throttle. * blkg_[rw]stat->cnt is replaced with cpu_cnt which is struct percpu_counter. This makes syncp unnecessary as remote accesses are handled by percpu_counter itself. * blkg_[rw]stat_init() can now fail due to percpu allocation failure and thus are updated to return int. * percpu_counters need explicit freeing. blkg_[rw]stat_exit() added. * As blkg_rwstat->cpu_cnt[] can't be read directly anymore, reading and summing results are stored in ->aux_cnt[] instead. * Custom per-cpu stat implementation in blk-throttle is removed. This makes all blkcg stat counters per-cpu without complicating policy implementations. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 120 +++++++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e8092276af58..fdc7ac08b1ce 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -14,12 +14,15 @@ */ #include -#include +#include #include #include #include #include +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -55,17 +58,16 @@ struct blkcg { /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for - * recursive. Used to carry stats of dead children. + * recursive. Used to carry stats of dead children, and, for blkg_rwstat, + * to carry result values from read and sum operations.
*/ struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; + struct percpu_counter cpu_cnt; atomic64_t aux_cnt; }; struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; @@ -486,10 +488,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) -static inline void blkg_stat_init(struct blkg_stat *stat) +static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) { - u64_stats_init(&stat->syncp); + int ret; + + ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); + if (ret) + return ret; + atomic64_set(&stat->aux_cnt, 0); + return 0; +} + +static inline void blkg_stat_exit(struct blkg_stat *stat) +{ + percpu_counter_destroy(&stat->cpu_cnt); } /** @@ -497,35 +510,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat) * @stat: target blkg_stat * @val: value to add * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. + * Add @val to @stat. The caller must ensure that IRQ on the same CPU + * don't re-enter this function for the same counter. */ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) { - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); + __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read - * - * Read the current value of @stat. The returned value doesn't include the - * aux count. This function can be called without synchroniztion and takes - * care of u64 atomicity. 
*/ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; + return percpu_counter_sum_positive(&stat->cpu_cnt); } /** @@ -534,7 +533,7 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) */ static inline void blkg_stat_reset(struct blkg_stat *stat) { - stat->cnt = 0; + percpu_counter_set(&stat->cpu_cnt, 0); atomic64_set(&stat->aux_cnt, 0); } @@ -552,14 +551,28 @@ static inline void blkg_stat_add_aux(struct blkg_stat *to, &to->aux_cnt); } -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { - int i; + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} - u64_stats_init(&rwstat->syncp); +static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_set(&rwstat->aux_cnt[i], 0); + percpu_counter_destroy(&rwstat->cpu_cnt[i]); } /** @@ -574,39 +587,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, int rw, uint64_t val) { - u64_stats_update_begin(&rwstat->syncp); + struct percpu_counter *cnt; if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else - rwstat->cnt[BLKG_RWSTAT_READ] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); + if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + cnt = 
&rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; - u64_stats_update_end(&rwstat->syncp); + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. + * Read the current snapshot of @rwstat and return it in the aux counts. */ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) { - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + struct blkg_rwstat result; + int i; - return tmp; + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&result.aux_cnt[i], + percpu_counter_sum_positive(&rwstat->cpu_cnt[i])); + return result; } /** @@ -621,7 +633,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; + return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); } /** @@ -632,10 +645,10 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { int i; - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); - - for (i = 0; i < BLKG_RWSTAT_NR; i++) + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); atomic64_set(&rwstat->aux_cnt[i], 0); + } } /** @@ -652,7 +665,8 @@ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]), + atomic64_add(atomic64_read(&v.aux_cnt[i]) + + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } -- cgit v1.2.3 From f12c74cab1635d67077ce8cc40da88b57980f637 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 
2015 14:55:23 -0700 Subject: blkcg: make blkg_[rw]stat_recursive_sum() to be able to index into blkcg_gq Currently, blkg_[rw]stat_recursive_sum() assume that the target counter is located in pd (blkg_policy_data); however, some counters are planned to be moved to blkg (blkcg_gq). This patch updates blkg_[rw]stat_recursive_sum() to take blkg and blkg_policy pointers instead of pd. If policy is NULL, it indexes into blkg. If non-NULL, into the blkg's pd of the policy. The existing usages are updated to maintain the current behaviors. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index fdc7ac08b1ce..4630ce8f9425 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -191,9 +191,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); struct blkg_conf_ctx { struct gendisk *disk; -- cgit v1.2.3 From 77ea733884eb5520f22c36def1309fe2ab61633e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:24 -0700 Subject: blkcg: move io_service_bytes and io_serviced stats into blkcg_gq Currently, both cfq-iosched and blk-throttle keep track of io_service_bytes and io_serviced stats. While keeping track of them separately may be useful during development, it doesn't make much sense otherwise. 
Also, blk-throttle was counting bio's as IOs while cfq-iosched request's, which is more confusing than informative. This patch adds ->stat_bytes and ->stat_ios to blkg (blkcg_gq), removes the counterparts from cfq-iosched and blk-throttle and let them print from the common blkg counters. The common counters are incremented during bio issue in blkcg_bio_issue_check(). The outputs are still filtered by whether the policy has blkg_policy_data on a given blkg, so cfq's output won't show up if it has never been used for a given blkg. The only times when the outputs would differ significantly are when policies are attached on the fly or elevators are switched back and forth. Those are quite exceptional operations and I don't think they warrant keeping separate counters. v3: Update blkio-controller.txt accordingly. v2: Account IOs during bio issues instead of request completions so that bio-based drivers can be handled the same way. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4630ce8f9425..286e1bde249f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -127,6 +127,9 @@ struct blkcg_gq { /* is this blkg online? 
protected by both blkcg and q locks */ bool online; + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; @@ -190,6 +193,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +int blkg_print_stat_bytes(struct seq_file *sf, void *v); +int blkg_print_stat_ios(struct seq_file *sf, void *v); +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off); @@ -700,6 +707,13 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, throtl = blk_throtl_bio(q, blkg, bio); + if (!throtl) { + blkg = blkg ?: q->root_blkg; + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags, + bio->bi_iter.bi_size); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1); + } + rcu_read_unlock(); return !throtl; } -- cgit v1.2.3 From c165b3e3c7bb68c2ed55a5ac2623f030d01d9567 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:29 -0700 Subject: blkcg: rename subsystem name from blkio to io blkio interface has become messy over time and is currently the largest. In addition to the inconsistent naming scheme, it has multiple stat files which report more or less the same thing, a number of debug stat files which expose internal details which shouldn't have been part of the public interface in the first place, recursive and non-recursive stats and leaf and non-leaf knobs. Both recursive vs. non-recursive and leaf vs. non-leaf distinctions don't make any sense on the unified hierarchy as only leaf cgroups can contain processes. 
cgroups is going through a major interface revision with the unified hierarchy involving significant fundamental usage changes and given that a significant portion of the interface doesn't make sense anymore, it's a good time to reorganize the interface. As the first step, this patch renames the external visible subsystem name from "blkio" to "io". This is more concise, matches the other two major subsystem names, "cpu" and "memory", and better suited as blkcg will be involved in anything writeback related too whether an actual block device is involved or not. As the subsystem legacy_name is set to "blkio", the only userland visible change outside the unified hierarchy is that blkcg is reported as "io" instead of "blkio" in the subsystem initialized message during boot. On the unified hierarchy, blkcg now appears as "io". Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- include/linux/blk-cgroup.h | 4 ++-- include/linux/cgroup_subsys.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 23ebb946e66f..5a5d79ee256f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). 
*/ - if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 286e1bde249f..db89acd2a864 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -221,7 +221,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); + return css_to_blkcg(task_css(tsk, io_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) @@ -234,7 +234,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) static inline struct cgroup_subsys_state * task_get_blkcg_css(struct task_struct *task) { - return task_get_css(task, blkio_cgrp_id); + return task_get_css(task, io_cgrp_id); } /** diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e4a96fb14403..86b5056104df 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -16,7 +16,7 @@ SUBSYS(cpuacct) #endif #if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) +SUBSYS(io) #endif #if IS_ENABLED(CONFIG_MEMCG) -- cgit v1.2.3 From 880f50e228f80626dff6327a6e281e40286f5228 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:30 -0700 Subject: blkcg: mark existing cftypes as legacy blkcg is about to grow interface for the unified hierarchy. Add legacy to existing cftypes. * blkcg_policy->cftypes -> blkcg_policy->legacy_cftypes * blk-cgroup.c:blkcg_files -> blkcg_legacy_files * cfq-iosched.c:cfq_blkcg_files -> cfq_blkcg_legacy_files * blk-throttle.c:throtl_files -> throtl_legacy_files Pure renames. No functional change. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index db89acd2a864..6e016e6fee87 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,7 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; /* cgroup files for the policy */ - struct cftype *cftypes; + struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; -- cgit v1.2.3 From 36aa9e5f591e84d67aad2c5bff75e413d77660dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:31 -0700 Subject: blkcg: move body parsing from blkg_conf_prep() to its callers Currently, blkg_conf_prep() expects input to be of the following form MAJ:MIN NUM and reads the NUM part into blkg_conf_ctx->v. This is quite restrictive and gets in the way in implementing blkcg interface for the unified hierarchy. This patch updates blkg_conf_prep() so that it expects MAJ:MIN BODY_STR where BODY_STR is an arbitrary string. blkg_conf_ctx->v is replaced with ->body which is a char pointer pointing to the start of BODY_STR. Parsing of the body is moved to blkg_conf_prep()'s callers. To allow using, for example, strsep() on blkg_conf_ctx->body, it is a non-const pointer and to accommodate that const is dropped from @input too. This doesn't cause any behavior changes.
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6e016e6fee87..85a4d989ae43 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -206,11 +206,11 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; - u64 v; + char *body; }; int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); + char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); -- cgit v1.2.3 From dd165eb3bb4ef16bcdb75417add40633f38c52b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:33 -0700 Subject: blkcg: misc preparations for unified hierarchy interface * Export blkg_dev_name() * Drop unnecessary @cft from __cfq_set_weight(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 85a4d989ae43..b270aef519c6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -182,6 +182,7 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); +const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), -- cgit v1.2.3 From 2ee867dcfa2eaef1063b686da55c35878b2da4a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:34 -0700 Subject: blkcg: implement interface for the unified hierarchy blkcg interface grew to be the biggest of all controllers and unfortunately most inconsistent too. 
The interface files are inconsistent with a number of close duplicates. Some files have recursive variants while others don't. There's a distinction between normal and leaf weights which isn't intuitive and there are a lot of stat knobs which don't make much sense outside of debugging and expose too many implementation details to userland. In the unified hierarchy, everything is always hierarchical and internal nodes can't have tasks, rendering moot the two structural issues twisting the current interface. The interface has to be updated in a significant way anyway and this is a good chance to revamp it as a whole. This patch implements blkcg interface for the unified hierarchy. * (from a previous patch) blkcg is identified by "io" instead of "blkio" on the unified hierarchy. Given that the whole interface is updated anyway, the rename shouldn't carry noticeable conversion overhead. * The original interface, which consisted of 27 files, is replaced with the following three files. blkio.stat : per-blkcg stats blkio.weight : per-cgroup and per-cgroup-queue weight settings blkio.max : per-cgroup-queue bps and iops max limits Documentation/cgroups/unified-hierarchy.txt updated accordingly. v2: blkcg_policy->dfl_cftypes wasn't removed on blkcg_policy_unregister() corrupting the cftypes list. Fixed.
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b270aef519c6..9a7c4bd45fff 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,6 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; /* cgroup files for the policy */ + struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ -- cgit v1.2.3 From 69d7fde5909b614114343974cfc52cb8ff30b544 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:36 -0700 Subject: blkcg: use CGROUP_WEIGHT_* scale for io.weight on the unified hierarchy cgroup is trying to make interface consistent across different controllers. For weight based resource control, the knob should have the range [1, 10000] and default to 100. This patch updates cfq-iosched so that the weight range conforms. The internal calculations have enough range and the widening of the weight range shouldn't cause any problem. * blkcg_policy->cpd_bind_fn() is added. If present, this is invoked when blkcg is attached to a hierarchy. * cfq_cpd_init() is updated to use the new default value on the unified hierarchy. * cfq_cpd_bind() callback is implemented to clear per-blkg configs and apply the default config matching the hierarchy type. * cfqd->root_group->[leaf_]weight initialization in cfq_init_queue() is moved into !CONFIG_CFQ_GROUP_IOSCHED block. cfq_cpd_bind() is now responsible for initializing the initial weights when blkcg is enabled. 
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9a7c4bd45fff..0a5cc7a1109b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -138,6 +138,7 @@ struct blkcg_gq { typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -155,6 +156,7 @@ struct blkcg_policy { blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_bind_cpd_fn *cpd_bind_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; -- cgit v1.2.3