From 8af01f56a03e9cbd91a55d688fce1315021efba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: s/cgroup_subsys_state/cgroup_css/ s/task_subsys_state/task_css/ The names of the two struct cgroup_subsys_state accessors - cgroup_subsys_state() and task_subsys_state() - are somewhat awkward. The former clashes with the type name and the latter doesn't even indicate it's somehow related to cgroup. We're about to revamp large portion of cgroup API, so, let's rename them so that they're less awkward. Most per-controller usages of the accessors are localized in accessor wrappers and given the amount of scheduled changes, this isn't gonna add any noticeable headache. Rename cgroup_subsys_state() to cgroup_css() and task_subsys_state() to task_css(). This patch is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-cgroup.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8056c03a3382..628e50f6f8a8 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -181,14 +181,13 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx); static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + return container_of(cgroup_css(cgroup, blkio_subsys_id), struct blkcg, css); } static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkcg, css); + return container_of(task_css(tsk, blkio_subsys_id), struct blkcg, css); } static inline struct blkcg *bio_blkcg(struct bio *bio) -- cgit v1.2.3 From a7c6d554aa01236ac2a9f851ab0f75704f76dfa2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add/update accessors which obtain subsys specific data from css css (cgroup_subsys_state) is usually embedded in a subsys specific data structure. Subsystems either use container_of() directly to cast from css to such data structure or has an accessor function wrapping such cast. As cgroup as whole is moving towards using css as the main interface handle, add and update such accessors to ease dealing with css's. All accessors explicitly handle NULL input and return NULL in those cases. While this looks like an extra branch in the code, as all controllers specific data structures have css as the first field, the casting doesn't involve any offsetting and the compiler can trivially optimize out the branch. * blkio, freezer, cpuset, cpu, cpuacct and net_cls didn't have such accessor. Added. * memory, hugetlb and devices already had one but didn't explicitly handle NULL input. Updated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-cgroup.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 628e50f6f8a8..8e5863e900bf 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -179,21 +179,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, void blkg_conf_finish(struct blkg_conf_ctx *ctx); +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} + static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { - return container_of(cgroup_css(cgroup, blkio_subsys_id), - struct blkcg, css); + return css_to_blkcg(cgroup_css(cgroup, blkio_subsys_id)); } static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return container_of(task_css(tsk, blkio_subsys_id), struct blkcg, css); + return css_to_blkcg(task_css(tsk, blkio_subsys_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) { if (bio && bio->bi_css) - return container_of(bio->bi_css, struct blkcg, css); + return css_to_blkcg(bio->bi_css); return task_blkcg(current); } -- cgit v1.2.3 From 6387698699afd72d6304566fb6ccf84bffe07c56 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add css_parent() Currently, controllers have to explicitly follow the cgroup hierarchy to find the parent of a given css. cgroup is moving towards using cgroup_subsys_state as the main controller interface construct, so let's provide a way to climb the hierarchy using just csses. This patch implements css_parent() which, given a css, returns its parent. The function is guarnateed to valid non-NULL parent css as long as the target css is not at the top of the hierarchy. freezer, cpuset, cpu, cpuacct, hugetlb, memory, net_cls and devices are converted to use css_parent() instead of accessing cgroup->parent directly. * __parent_ca() is dropped from cpuacct and its usage is replaced with parent_ca(). The only difference between the two was NULL test on cgroup->parent which is now embedded in css_parent() making the distinction moot. Note that eventually a css->parent field will be added to css and the NULL check in css_parent() will go away. This patch shouldn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-cgroup.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8e5863e900bf..b6802c46d68f 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -209,9 +209,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { - struct cgroup *pcg = blkcg->css.cgroup->parent; - - return pcg ? cgroup_to_blkcg(pcg) : NULL; + return css_to_blkcg(css_parent(&blkcg->css)); } /** -- cgit v1.2.3 From eb95419b023abacb415e2a18fea899023ce7624d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- block/blk-cgroup.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 290792a13e3c..79fd9f4fadb7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -765,18 +765,18 @@ struct cftype blkcg_files[] = { /** * blkcg_css_offline - cgroup css_offline callback - * @cgroup: cgroup of interest + * @css: css of interest * - * This function is called when @cgroup is about to go away and responsible - * for shooting down all blkgs associated with @cgroup. blkgs should be + * This function is called when @css is about to go away and responsible + * for shooting down all blkgs associated with @css. blkgs should be * removed while holding both q and blkcg locks. As blkcg lock is nested * inside q lock, this function performs reverse double lock dancing. * * This is the blkcg counterpart of ioc_release_fn(). */ -static void blkcg_css_offline(struct cgroup *cgroup) +static void blkcg_css_offline(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); spin_lock_irq(&blkcg->lock); @@ -798,21 +798,21 @@ static void blkcg_css_offline(struct cgroup *cgroup) spin_unlock_irq(&blkcg->lock); } -static void blkcg_css_free(struct cgroup *cgroup) +static void blkcg_css_free(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); if (blkcg != &blkcg_root) kfree(blkcg); } -static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { static atomic64_t id_seq = ATOMIC64_INIT(0); struct blkcg *blkcg; - struct cgroup *parent = cgroup->parent; - if (!parent) { + if (!parent_css) { blkcg = &blkcg_root; goto done; } @@ -883,14 +883,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) -- cgit v1.2.3 From 2bb566cb68dfafad328af666ebadf0e49accd6ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add subsys backlink pointer to cftype cgroup is transitioning to using css (cgroup_subsys_state) instead of cgroup as the primary subsystem handle. The cgroupfs file interface will be converted to use css's which requires finding out the subsystem from cftype so that the matching css can be determined from the cgroup. This patch adds cftype->ss which points to the subsystem the file belongs to. The field is initialized while a cftype is being registered. This makes it unnecessary to explicitly specify the subsystem for other cftype handling functions. @ss argument dropped from various cftype handling functions. This patch shouldn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Cc: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 79fd9f4fadb7..34063739745b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1128,7 +1128,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* kill the intf files first */ if (pol->cftypes) - cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); + cgroup_rm_cftypes(pol->cftypes); /* unregister and update blkgs */ blkcg_policy[pol->plid] = NULL; -- cgit v1.2.3 From 182446d087906de40e514573a92a97b203695f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in file methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup. Please see the previous commit which converts the subsystem methods for rationale. This patch converts all cftype file operations to take @css instead of @cgroup. cftypes for the cgroup core files don't have their subsytem pointer set. These will automatically use the dummy_css added by the previous patch and can be converted the same way. Most subsystem conversions are straight forwards but there are some interesting ones. * freezer: update_if_frozen() is also converted to take @css instead of @cgroup for consistency. This will make the code look simpler too once iterators are converted to use css. * memory/vmpressure: mem_cgroup_from_css() needs to be exported to vmpressure while mem_cgroup_from_cont() can be made static. Updated accordingly. * cpu: cgroup_tg() doesn't have any user left. Removed. * cpuacct: cgroup_ca() doesn't have any user left. Removed. * hugetlb: hugetlb_cgroup_form_cgroup() doesn't have any user left. Removed. * net_cls: cgrp_cls_state() doesn't have any user left. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- block/blk-cgroup.c | 6 ++-- block/blk-throttle.c | 32 +++++++++---------- block/cfq-iosched.c | 90 ++++++++++++++++++++++++++-------------------------- 3 files changed, 64 insertions(+), 64 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 34063739745b..f46f3c69179c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -437,10 +437,10 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, return &blkg->rl; } -static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, - u64 val) +static int blkcg_reset_stats(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 val) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 08a32dfd3844..88bcfb651b0b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1293,10 +1293,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, cft->private, true); @@ -1325,26 +1325,26 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, return __blkg_prfill_u64(sf, pd, v); } -static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_uint(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, - bool is_u64) +static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, + const char *buf, bool is_u64) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct throtl_grp *tg; struct throtl_service_queue *sq; @@ -1403,16 +1403,16 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, return 0; } -static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, true); + return tg_set_conf(css, cft, buf, true); } -static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, false); + return tg_set_conf(css, cft, buf, false); } static struct cftype throtl_files[] = { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d5bbdcfd0dab..dabb9d02cf9a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1607,12 +1607,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); } -static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } @@ -1626,35 +1625,34 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); } -static int cfqg_print_leaf_weight_device(struct cgroup *cgrp, +static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } -static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, +static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); return 0; } -static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", - cgroup_to_blkcg(cgrp)->cfq_leaf_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); return 0; } -static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf, bool is_leaf_weight) +static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf, + bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; int ret; @@ -1680,22 +1678,22 @@ static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, return ret; } -static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, false); + return __cfqg_set_weight_device(css, cft, buf, false); } -static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, true); + return __cfqg_set_weight_device(css, cft, buf, true); } -static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, - bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val, bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) @@ -1727,30 +1725,32 @@ static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, return 0; } -static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - return __cfq_set_weight(cgrp, cft, val, false); + return __cfq_set_weight(css, cft, val, false); } -static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - return __cfq_set_weight(cgrp, cft, val, true); + return __cfq_set_weight(css, cft, val, true); } -static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, +static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, cft->private, true); @@ -1773,20 +1773,20 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &sum); } -static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, cft->private, true); @@ -1810,10 +1810,10 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, } /* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, 0, false); -- cgit v1.2.3 From 492eb21b98f88e411a8bb43d6edcd7d7022add10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:25 -0400 Subject: cgroup: make hierarchy iterators deal with cgroup_subsys_state instead of cgroup cgroup is currently in the process of transitioning to using css (cgroup_subsys_state) as the primary handle instead of cgroup in subsystem API. For hierarchy iterators, this is beneficial because * In most cases, css is the only thing subsystems care about anyway. * On the planned unified hierarchy, iterations for different subsystems will need to skip over different subtrees of the hierarchy depending on which subsystems are enabled on each cgroup. Passing around css makes it unnecessary to explicitly specify the subsystem in question as css is intersection between cgroup and subsystem * For the planned unified hierarchy, css's would need to be created and destroyed dynamically independent from cgroup hierarchy. Having cgroup core manage css iteration makes enforcing deref rules a lot easier. Most subsystem conversions are straight-forward. Noteworthy changes are * blkio: cgroup_to_blkcg() is no longer used. Removed. * freezer: cgroup_freezer() is no longer used. Removed. * devices: cgroup_to_devcgroup() is no longer used. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe --- block/blk-cgroup.c | 8 ++++---- block/blk-cgroup.h | 25 +++++++++---------------- block/blk-throttle.c | 8 ++++---- 3 files changed, 17 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f46f3c69179c..4b40640240a4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -614,7 +614,7 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; u64 sum; lockdep_assert_held(pd->blkg->q->queue_lock); @@ -622,7 +622,7 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) sum = blkg_stat_read((void *)pd + off); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_stat *stat = (void *)pos_pd + off; @@ -649,7 +649,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum; int i; @@ -658,7 +658,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, sum = blkg_rwstat_read((void *)pd + off); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_rwstat *rwstat = (void *)pos_pd + off; struct blkg_rwstat tmp; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b6802c46d68f..855538630300 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -184,11 +184,6 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) return css ? container_of(css, struct blkcg, css) : NULL; } -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) -{ - return css_to_blkcg(cgroup_css(cgroup, blkio_subsys_id)); -} - static inline struct blkcg *task_blkcg(struct task_struct *tsk) { return css_to_blkcg(task_css(tsk, blkio_subsys_id)); @@ -289,32 +284,31 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may - * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip - * subtree. + * update @pos_css by calling css_rightmost_descendant() to skip subtree. */ -#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. */ -#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) /** @@ -577,7 +571,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 88bcfb651b0b..8cefa7f8590e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1349,7 +1349,7 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, struct throtl_grp *tg; struct throtl_service_queue *sq; struct blkcg_gq *blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; int ret; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); @@ -1380,7 +1380,7 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, * blk-throttle. */ tg_update_has_rules(tg); - blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1623,7 +1623,7 @@ void blk_throtl_drain(struct request_queue *q) { struct throtl_data *td = q->td; struct blkcg_gq *blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; struct bio *bio; int rw; @@ -1636,7 +1636,7 @@ void blk_throtl_drain(struct request_queue *q) * better to walk service_queue tree directly but blkg walk is * easier. */ - blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg) + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) tg_drain_bios(&blkg_to_tg(blkg)->service_queue); tg_drain_bios(&td_root_tg(td)->service_queue); -- cgit v1.2.3 From d99c8727e7bbc01b70e2c57e6127bfab26b868fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make cgroup_taskset deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cgroup_taskset which is used by the subsystem attach methods is the last cgroup subsystem API which isn't using css as the handle. Update cgroup_taskset_cur_cgroup() to cgroup_taskset_cur_css() and cgroup_taskset_for_each() to take @skip_css instead of @skip_cgrp. The conversions are pretty mechanical. One exception is cpuset::cgroup_cs(), which lost its last user and got removed. This patch shouldn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Daniel Wagner Cc: Ingo Molnar Cc: Matt Helsley Cc: Steven Rostedt --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4b40640240a4..54ad00292edf 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -891,7 +891,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) -- cgit v1.2.3 From bd8815a6d802fc16a7a106e170593aa05dc17e72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make css_for_each_descendant() and friends include the origin css in the iteration Previously, all css descendant iterators didn't include the origin (root of subtree) css in the iteration. The reasons were maintaining consistency with css_for_each_child() and that at the time of introduction more use cases needed skipping the origin anyway; however, given that css_is_descendant() considers self to be a descendant, omitting the origin css has become more confusing and looking at the accumulated use cases rather clearly indicates that including origin would result in simpler code overall. While this is a change which can easily lead to subtle bugs, cgroup API including the iterators has recently gone through major restructuring and no out-of-tree changes will be applicable without adjustments making this a relatively acceptable opportunity for this type of change. The conversions are mostly straight-forward. If the iteration block had explicit origin handling before or after, it's moved inside the iteration. If not, if (pos == origin) continue; is added. Some conversions add extra reference get/put around origin handling by consolidating origin handling and the rest. While the extra ref operations aren't strictly necessary, this shouldn't cause any noticeable difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Michal Hocko Cc: Jens Axboe Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- block/blk-cgroup.c | 8 ++------ block/blk-cgroup.h | 4 +++- block/blk-throttle.c | 3 --- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 54ad00292edf..e90c7c164c83 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -615,12 +615,10 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; - u64 sum; + u64 sum = 0; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_stat_read((void *)pd + off); - rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); @@ -650,13 +648,11 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; - struct blkg_rwstat sum; + struct blkg_rwstat sum = { }; int i; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_rwstat_read((void *)pd + off); - rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 855538630300..ae6969a7ffd4 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -291,6 +291,7 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ @@ -304,7 +305,8 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8cefa7f8590e..8331aba9426f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1379,7 +1379,6 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - tg_update_has_rules(tg); blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) tg_update_has_rules(blkg_to_tg(blkg)); @@ -1639,8 +1638,6 @@ void blk_throtl_drain(struct request_queue *q) blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) tg_drain_bios(&blkg_to_tg(blkg)->service_queue); - tg_drain_bios(&td_root_tg(td)->service_queue); - /* finally, transfer bios from top-level tg's into the td */ tg_drain_bios(&td->service_queue); -- cgit v1.2.3 From a9d6ceb838755c24dde8a0ca02c3378926fc63db Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 1 Jul 2013 15:16:25 +0200 Subject: [SCSI] return ENOSPC on thin provisioning failure When the thin provisioning hard threshold is reached we should return ENOSPC to inform upper layers about this fact. Signed-off-by: Hannes Reinecke Signed-off-by: James Bottomley --- block/blk-core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 93a18d1d3da8..68ce4d53a528 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2318,6 +2318,9 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) case -ETIMEDOUT: error_type = "timeout"; break; + case -ENOSPC: + error_type = "critical space allocation"; + break; case -EIO: default: error_type = "I/O"; -- cgit v1.2.3 From 7e782af57649f8a8e943d80104c946a5cd7af7cc Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 1 Jul 2013 15:16:26 +0200 Subject: [SCSI] Return ENODATA on medium error When a medium error is detected the SCSI stack should return ENODATA to the upper layers. [jejb: fix whitespace error] Signed-off-by: Hannes Reinecke Signed-off-by: James Bottomley --- block/blk-core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 68ce4d53a528..c04505358342 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2321,6 +2321,9 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) case -ENOSPC: error_type = "critical space allocation"; break; + case -ENODATA: + error_type = "critical medium"; + break; case -EIO: default: error_type = "I/O"; -- cgit v1.2.3 From 7aef2e780b13973ea60aed8c556107dabde6a495 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Wed, 11 Sep 2013 13:21:07 -0600 Subject: block: trace all devices plug operation In func blk_queue_bio, if list of plug is empty,it will call blk_trace_plug. If process deal with a single device,it't ok.But if process deal with multi devices,it only trace the first device. Using request_count to judge, it can soleve this problem. In addition, i modify the comment. Signed-off-by: Jianpeng Ma Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 93a18d1d3da8..91037f74668e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1549,11 +1549,9 @@ get_rq: if (plug) { /* * If this is the first request added after a plug, fire - * of a plug trace. If others have been added before, check - * if we have multiple devices in this plug. If so, make a - * note to sort the list before dispatch. + * of a plug trace. */ - if (list_empty(&plug->list)) + if (!request_count) trace_block_plug(q); else { if (request_count >= BLK_MAX_REQUEST_COUNT) { -- cgit v1.2.3 From c1b511eb211a6c72d66f7755d2b30a9a91ef9423 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 29 Aug 2013 15:21:42 -0700 Subject: block: Convert kmalloc_node(...GFP_ZERO...) to kzalloc_node(...) Use the helper function instead of __GFP_ZERO. Signed-off-by: Joe Perches Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- block/deadline-iosched.c | 2 +- block/elevator.c | 2 +- block/genhd.c | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d5bbdcfd0dab..f0468e252ee4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4358,7 +4358,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) if (!eq) return -ENOMEM; - cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); + cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); if (!cfqd) { kobject_put(&eq->kobj); return -ENOMEM; diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 20614a332362..9ef66406c625 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -346,7 +346,7 @@ static int deadline_init_queue(struct request_queue *q, struct elevator_type *e) if (!eq) return -ENOMEM; - dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); + dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) { kobject_put(&eq->kobj); return -ENOMEM; diff --git a/block/elevator.c b/block/elevator.c index 668394d18588..2bcbd8cc14d4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -155,7 +155,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, { struct elevator_queue *eq; - eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node); + eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node); if (unlikely(!eq)) goto err; diff --git a/block/genhd.c b/block/genhd.c index dadf42b454a3..791f41943132 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1252,8 +1252,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) { struct gendisk *disk; - disk = kmalloc_node(sizeof(struct gendisk), - GFP_KERNEL | __GFP_ZERO, node_id); + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { if (!init_part_stats(&disk->part0)) { kfree(disk); -- cgit v1.2.3 From 577cee1e8db6b98b51506e956264b84553426e65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 28 Aug 2013 14:26:50 -0400 Subject: blkcg: relocate root_blkg setting and clearing Hello, Jens. The original thread can be read from http://thread.gmane.org/gmane.linux.kernel.cgroups/8937 While it leads to oops, given that it only triggers under specific configurations which aren't common. I don't think it's necessary to backport it through -stable and merging it during the coming merge window should be enough. Thanks! ----- 8< ----- Currently, q->root_blkg and q->root_rl.blkg are set from blkcg_activate_policy() and cleared from blkg_destroy_all(). This doesn't necessarily coincide with the lifetime of the root blkcg_gq leading to the following oops when blkcg is enabled but no policy is activated because __blk_queue_next_rl() malfunctions expecting the root_blkg pointers to be set. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] __wake_up_common+0x2b/0x90 PGD 60f7a9067 PUD 60f4c9067 PMD 0 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC gsmi: Log Shutdown Reason 0x03 Modules linked in: act_mirred cls_tcindex cls_prioshift sch_dsmark xt_multiport iptable_mangle sata_mv elephant elephant_dev_num cdc_acm uhci_hcd ehci_hcd i2c_d CPU: 9 PID: 41382 Comm: iSCSI-write- Not tainted 3.11.0-dbg-DEV #19 Hardware name: Intel XXX task: ffff88060d16eec0 ti: ffff88060d170000 task.ti: ffff88060d170000 RIP: 0010:[] [] __wake_up_common+0x2b/0x90 RSP: 0000:ffff88060d171818 EFLAGS: 00010096 RAX: 0000000000000082 RBX: ffff880baa3dee60 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffff880baa3dee60 RBP: ffff88060d171858 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000002 R12: ffff880baa3dee98 R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000003 FS: 00007f977cba6700(0000) GS:ffff880c79c60000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000060f7a5000 CR4: 00000000000007e0 Stack: 0000000000000082 0000000000000000 ffff88060d171858 ffff880baa3dee60 0000000000000082 0000000000000003 0000000000000000 0000000000000000 ffff88060d171898 ffffffff810c7848 ffff88060d171888 ffff880bde4bc4b8 Call Trace: [] __wake_up+0x48/0x70 [] __blk_drain_queue+0x123/0x190 [] blk_cleanup_queue+0xf5/0x210 [] __scsi_remove_device+0x5a/0xd0 [] scsi_remove_device+0x34/0x50 [] scsi_remove_target+0x16b/0x220 [] __iscsi_unbind_session+0xd1/0x1b0 [] iscsi_remove_session+0xe2/0x1c0 [] iscsi_destroy_session+0x16/0x60 [] iscsi_session_teardown+0xd9/0x100 [] iscsi_sw_tcp_session_destroy+0x5a/0xb0 [] iscsi_if_rx+0x10e8/0x1560 [] netlink_unicast+0x145/0x200 [] netlink_sendmsg+0x303/0x410 [] sock_sendmsg+0xa6/0xd0 [] ___sys_sendmsg+0x38c/0x3a0 [] ? fget_light+0x40/0x160 [] ? fget_light+0x99/0x160 [] ? fget_light+0x40/0x160 [] __sys_sendmsg+0x49/0x90 [] SyS_sendmsg+0x12/0x20 [] system_call_fastpath+0x16/0x1b Code: 66 66 66 66 90 55 48 89 e5 41 57 41 89 f7 41 56 41 89 ce 41 55 41 54 4c 8d 67 38 53 48 83 ec 18 89 55 c4 48 8b 57 38 4c 89 45 c8 <4c> 8b 2a 48 8d 42 e8 49 Fix it by moving r->root_blkg and q->root_rl.blkg setting to blkg_create() and clearing to blkg_destroy() so that they area initialized when a root blkg is created and cleared when destroyed. Reported-and-tested-by: Anatol Pomozov Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 290792a13e3c..db30b6beee72 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -235,8 +235,13 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg->online = true; spin_unlock(&blkcg->lock); - if (!ret) + if (!ret) { + if (blkcg == &blkcg_root) { + q->root_blkg = blkg; + q->root_rl.blkg = blkg; + } return blkg; + } /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); @@ -334,6 +339,15 @@ static void blkg_destroy(struct blkcg_gq *blkg) if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); + /* + * If root blkg is destroyed. Just clear the pointer since root_rl + * does not take reference on root blkg. + */ + if (blkcg == &blkcg_root) { + blkg->q->root_blkg = NULL; + blkg->q->root_rl.blkg = NULL; + } + /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. @@ -360,13 +374,6 @@ static void blkg_destroy_all(struct request_queue *q) blkg_destroy(blkg); spin_unlock(&blkcg->lock); } - - /* - * root blkg is destroyed. Just clear the pointer since - * root_rl does not take reference on root blkg. - */ - q->root_blkg = NULL; - q->root_rl.blkg = NULL; } /* @@ -973,8 +980,6 @@ int blkcg_activate_policy(struct request_queue *q, ret = PTR_ERR(blkg); goto out_unlock; } - q->root_blkg = blkg; - q->root_rl.blkg = blkg; list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; -- cgit v1.2.3 From ed751e683c563be64322b9bfa0f0f7e5da9bd37c Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:08 -0700 Subject: block/blk-sysfs.c: replace strict_strtoul() with kstrtoul() The usage of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5efc5a647183..3aa5b195f4dd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -29,7 +29,7 @@ queue_var_store(unsigned long *var, const char *page, size_t count) int err; unsigned long v; - err = strict_strtoul(page, 10, &v); + err = kstrtoul(page, 10, &v); if (err || v > UINT_MAX) return -EINVAL; -- cgit v1.2.3 From bab55417b10c95e6bff8cea315c315adfa009487 Mon Sep 17 00:00:00 2001 From: Cai Zhiyong Date: Wed, 11 Sep 2013 14:20:09 -0700 Subject: block: support embedded device command line partition Read block device partition table from command line. The partition used for fixed block device (eMMC) embedded device. It is no MBR, save storage space. Bootloader can be easily accessed by absolute address of data on the block device. Users can easily change the partition. This code reference MTD partition, source "drivers/mtd/cmdlinepart.c" About the partition verbose reference "Documentation/block/cmdline-partition.txt" [akpm@linux-foundation.org: fix printk text] [yongjun_wei@trendmicro.com.cn: fix error return code in parse_parts()] Signed-off-by: Cai Zhiyong Cc: Karel Zak Cc: "Wanglin (Albert)" Cc: Marius Groeger Cc: David Woodhouse Cc: Jens Axboe Cc: Brian Norris Cc: Artem Bityutskiy Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/Kconfig | 6 ++ block/Makefile | 1 + block/cmdline-parser.c | 250 +++++++++++++++++++++++++++++++++++++++++++++ block/partitions/Kconfig | 7 ++ block/partitions/Makefile | 1 + block/partitions/check.c | 4 + block/partitions/cmdline.c | 99 ++++++++++++++++++ block/partitions/cmdline.h | 2 + 8 files changed, 370 insertions(+) create mode 100644 block/cmdline-parser.c create mode 100644 block/partitions/cmdline.c create mode 100644 block/partitions/cmdline.h (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index a7e40a7c8214..7f38e40fee08 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -99,6 +99,12 @@ config BLK_DEV_THROTTLING See Documentation/cgroups/blkio-controller.txt for more information. +config CMDLINE_PARSER + bool "Block device command line partition parser" + default n + ---help--- + Parsing command line, get the partitions information. + menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 39b76ba66ffd..4fa4be544ece 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +obj-$(CONFIG_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c new file mode 100644 index 000000000000..cc2637f8674e --- /dev/null +++ b/block/cmdline-parser.c @@ -0,0 +1,250 @@ +/* + * Parse command line, get partition information + * + * Written by Cai Zhiyong + * + */ +#include +#include +#include + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? (sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} + +/* + * add_part() + * 0 success. + * 1 can not add so many partitions. + */ +void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param) + +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, param)) + break; + } +} diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 4cebb2f0d2f4..87a32086535d 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -260,3 +260,10 @@ config SYSV68_PARTITION partition table format used by Motorola Delta machines (using sysv68). Otherwise, say N. + +config CMDLINE_PARTITION + bool "Command line partition support" if PARTITION_ADVANCED + select CMDLINE_PARSER + help + Say Y here if you would read the partitions table from bootargs. + The format for the command line is just like mtdparts. diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 2be4d7ba4e3a..37a95270503c 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o obj-$(CONFIG_AIX_PARTITION) += aix.o +obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o diff --git a/block/partitions/check.c b/block/partitions/check.c index 19ba207ea7d1..9ac1df74f699 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -34,6 +34,7 @@ #include "efi.h" #include "karma.h" #include "sysv68.h" +#include "cmdline.h" int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ @@ -65,6 +66,9 @@ static int (*check_part[])(struct parsed_partitions *) = { adfspart_check_ADFS, #endif +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c new file mode 100644 index 000000000000..56cf4ffad51e --- /dev/null +++ b/block/partitions/cmdline.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013 HUAWEI + * Author: Cai Zhiyong + * + * Read block device partition table from command line. + * The partition used for fixed block device (eMMC) embedded device. + * It is no MBR, save storage space. Bootloader can be easily accessed + * by absolute address of data on the block device. + * Users can easily change the partition. + * + * The format for the command line is just like mtdparts. + * + * Verbose config please reference "Documentation/block/cmdline-partition.txt" + * + */ + +#include + +#include "check.h" +#include "cmdline.h" + +static char *cmdline; +static struct cmdline_parts *bdev_parts; + +static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +{ + int label_min; + struct partition_meta_info *info; + char tmp[sizeof(info->volname) + 4]; + struct parsed_partitions *state = (struct parsed_partitions *)param; + + if (slot >= state->limit) + return 1; + + put_partition(state, slot, subpart->from >> 9, + subpart->size >> 9); + + info = &state->parts[slot].info; + + label_min = min_t(int, sizeof(info->volname) - 1, + sizeof(subpart->name)); + strncpy(info->volname, subpart->name, label_min); + info->volname[label_min] = '\0'; + + snprintf(tmp, sizeof(tmp), "(%s)", info->volname); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + state->parts[slot].has_info = true; + + return 0; +} + +static int __init cmdline_parts_setup(char *s) +{ + cmdline = s; + return 1; +} +__setup("blkdevparts=", cmdline_parts_setup); + +/* + * Purpose: allocate cmdline partitions. + * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + */ +int cmdline_partition(struct parsed_partitions *state) +{ + sector_t disk_size; + char bdev[BDEVNAME_SIZE]; + struct cmdline_parts *parts; + + if (cmdline) { + if (bdev_parts) + cmdline_parts_free(&bdev_parts); + + if (cmdline_parts_parse(&bdev_parts, cmdline)) { + cmdline = NULL; + return -1; + } + cmdline = NULL; + } + + if (!bdev_parts) + return 0; + + bdevname(state->bdev, bdev); + parts = cmdline_parts_find(bdev_parts, bdev); + if (!parts) + return 0; + + disk_size = get_capacity(state->bdev->bd_disk) << 9; + + cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h new file mode 100644 index 000000000000..26e0f8da1414 --- /dev/null +++ b/block/partitions/cmdline.h @@ -0,0 +1,2 @@ + +int cmdline_partition(struct parsed_partitions *state); -- cgit v1.2.3 From 3ddc5b46a8e90f3c9251338b60191d0a804b0d92 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 11 Sep 2013 14:23:18 -0700 Subject: kernel-wide: fix missing validations on __get/__put/__copy_to/__copy_from_user() I found the following pattern that leads in to interesting findings: grep -r "ret.*|=.*__put_user" * grep -r "ret.*|=.*__get_user" * grep -r "ret.*|=.*__copy" * The __put_user() calls in compat_ioctl.c, ptrace compat, signal compat, since those appear in compat code, we could probably expect the kernel addresses not to be reachable in the lower 32-bit range, so I think they might not be exploitable. For the "__get_user" cases, I don't think those are exploitable: the worse that can happen is that the kernel will copy kernel memory into in-kernel buffers, and will fail immediately afterward. The alpha csum_partial_copy_from_user() seems to be missing the access_ok() check entirely. The fix is inspired from x86. This could lead to information leak on alpha. I also noticed that many architectures map csum_partial_copy_from_user() to csum_partial_copy_generic(), but I wonder if the latter is performing the access checks on every architectures. Signed-off-by: Mathieu Desnoyers Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Jens Axboe Cc: Oleg Nesterov Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/compat_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7e5d474dc6ba..fbd5a67cb773 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -70,7 +70,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, return ret; ret = copy_to_user(ugeo, &geo, 4); - ret |= __put_user(geo.start, &ugeo->start); + ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; -- cgit v1.2.3 From c2ebdc2439f50c049fd362bb225aaf78fe8e4cb8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:55 -0700 Subject: partitions/efi: use lba-aware partition records The kernel's GPT implementation currently uses the generic 'struct partition' type for dealing with legacy MBR partition records. While this is is useful for disklabels that we designed for CHS addressing, such as msdos, it doesn't adapt well to newer standards that use LBA instead, such as GUID partition tables. Furthermore, these generic partition structures do not have all the required fields to properly follow the UEFI specs. While a CHS address can be translated to LBA, it's much simpler and cleaner to just replace the partition type. This patch adds a new 'gpt_record' type that is fully compliant with EFI and will allow, in the next patches, to add more checks to properly verify a protective MBR, which is paramount to probing a device that makes use of GPT. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 9 ++++----- block/partitions/efi.h | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index c85fc895ecdb..bd8fb22b2109 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -149,12 +149,11 @@ static u64 last_lba(struct block_device *bdev) bdev_logical_block_size(bdev)) - 1ULL; } -static inline int -pmbr_part_valid(struct partition *part) +static inline int pmbr_part_valid(gpt_mbr_record *part) { - if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sect) == 1UL) - return 1; + if (part->os_type == EFI_PMBR_OSTYPE_EFI_GPT && + le32_to_cpu(part->start_sector) == 1UL) + return 1; return 0; } diff --git a/block/partitions/efi.h b/block/partitions/efi.h index b69ab729558f..e645ecb35bf3 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -101,11 +101,25 @@ typedef struct _gpt_entry { efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; } __attribute__ ((packed)) gpt_entry; +typedef struct _gpt_mbr_record { + u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */ + u8 start_head; /* unused by EFI, pt start in CHS */ + u8 start_sector; /* unused by EFI, pt start in CHS */ + u8 start_track; + u8 os_type; /* EFI and legacy non-EFI OS types */ + u8 end_head; /* unused by EFI, pt end in CHS */ + u8 end_sector; /* unused by EFI, pt end in CHS */ + u8 end_track; /* unused by EFI, pt end in CHS */ + __le32 starting_lba; /* used by EFI - start addr of the on disk pt */ + __le32 size_in_lba; /* used by EFI - size of pt in LBA */ +} __packed gpt_mbr_record; + + typedef struct _legacy_mbr { u8 boot_code[440]; __le32 unique_mbr_signature; __le16 unknown; - struct partition partition_record[4]; + gpt_mbr_record partition_record[4]; __le16 signature; } __attribute__ ((packed)) legacy_mbr; -- cgit v1.2.3 From 33afd7a7df1a1f82675857a75572cdf4a8599e9f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:56 -0700 Subject: partitions/efi: check pmbr record's starting lba Per the UEFI Specs 2.4, June 2013, the starting lba of the partition that has the EFI GPT (0xEE) must be set to 0x00000001 - this is obviously the LBA of the GPT Partition Header. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index bd8fb22b2109..7a2b74f0d06f 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -151,10 +151,19 @@ static u64 last_lba(struct block_device *bdev) static inline int pmbr_part_valid(gpt_mbr_record *part) { - if (part->os_type == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sector) == 1UL) - return 1; - return 0; + if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) + goto invalid; + + /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ + if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) + goto invalid; + + if (le32_to_cpu(part->start_sector) != 1UL) + goto invalid; + + return 1; +invalid: + return 0; } /** -- cgit v1.2.3 From 3e69ac344007bec5e3987ac86619e140fbc79b72 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:57 -0700 Subject: partitions/efi: do not require gpt partition to begin at sector 1 When detecting a valid protective MBR, the Linux kernel isn't picky about the partition (1-4) the 0xEE is at, but, unlike other operating systems, it does require it to begin at the second sector (sector 1). This check, apart from it not being enforced by UEFI, and causing Linux to potentially fail to detect any *valid* partitions on the disk, can present problems when dealing with hybrid MBRs[1]. For compatibility reasons, if the first partition is hybridized, the 0xEE partition must be small enough to ensure that it only protects the GPT data structures - as opposed to the the whole disk in a protective MBR. This problem is very well described by Rod Smith[1]: where MBR-only partitioning programs (such as older versions of fdisk) can see some of the disk space as unallocated, thus loosing the purpose of the 0xEE partition's protection of GPT data structures. By dropping this check, this patch enables Linux to be more flexible when probing for GPT disklabels. [1] http://www.rodsbooks.com/gdisk/hybrid.html#reactions Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 7a2b74f0d06f..1b499dc8fc78 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -158,9 +158,6 @@ static inline int pmbr_part_valid(gpt_mbr_record *part) if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) goto invalid; - if (le32_to_cpu(part->start_sector) != 1UL) - goto invalid; - return 1; invalid: return 0; -- cgit v1.2.3 From b05ebbbbeb67a420d06567c6b9618a9e644d6104 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:58 -0700 Subject: partitions/efi: detect hybrid MBRs One of the biggest problems with GPT is compatibility with older, non-GPT systems. The problem is addressed by creating hybrid mbrs, an extension, or variant, of the traditional protective mbr. This contains, apart from the 0xEE partition, up three additional primary partitions that point to the same space marked by up to three GPT partitions. The result is that legacy OSs can see the three required MBR partitions and at the same time ignore the GPT-aware partitions that protect the GPT structures. While hybrid MBRs are hacks, workarounds and simply not part of the GPT standard, they do exist and we have no way around them. For instance, by default, OSX creates a hybrid scheme when using multi-OS booting. In order for Linux to properly discover protective MBRs, it must be made aware of devices that have hybrid MBRs. No functionality is changed by this patch, just a debug message informing the user of the MBR scheme that is being used. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 74 ++++++++++++++++++++++++++++++++++++-------------- block/partitions/efi.h | 3 ++ 2 files changed, 56 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 1b499dc8fc78..e3cb4f19cf6d 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -158,7 +158,7 @@ static inline int pmbr_part_valid(gpt_mbr_record *part) if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) goto invalid; - return 1; + return GPT_MBR_PROTECTIVE; invalid: return 0; } @@ -167,21 +167,48 @@ invalid: * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure * - * Description: Returns 1 if PMBR is valid, 0 otherwise. - * Validity depends on two things: + * Description: Checks for a valid protective or hybrid + * master boot record (MBR). The validity of a pMBR depends + * on all of the following properties: * 1) MSDOS signature is in the last two bytes of the MBR * 2) One partition of type 0xEE is found + * + * In addition, a hybrid MBR will have up to three additional + * primary partitions, which point to the same space that's + * marked out by up to three GPT partitions. + * + * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or + * GPT_MBR_HYBRID depending on the device layout. */ -static int -is_pmbr_valid(legacy_mbr *mbr) +static int is_pmbr_valid(legacy_mbr *mbr) { - int i; + int i, ret = 0; /* invalid by default */ + if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) - return 0; + goto done; + + for (i = 0; i < 4; i++) { + ret = pmbr_part_valid(&mbr->partition_record[i]); + if (ret == GPT_MBR_PROTECTIVE) { + /* + * Ok, we at least know that there's a protective MBR, + * now check if there are other partition types for + * hybrid MBR. + */ + goto check_hybrid; + } + } + + if (ret != GPT_MBR_PROTECTIVE) + goto done; +check_hybrid: for (i = 0; i < 4; i++) - if (pmbr_part_valid(&mbr->partition_record[i])) - return 1; - return 0; + if ((mbr->partition_record[i].os_type != + EFI_PMBR_OSTYPE_EFI_GPT) && + (mbr->partition_record[i].os_type != 0x00)) + ret = GPT_MBR_HYBRID; +done: + return ret; } /** @@ -548,17 +575,22 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, lastlba = last_lba(state->bdev); if (!force_gpt) { - /* This will be added to the EFI Spec. per Intel after v1.02. */ - legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); - if (legacymbr) { - read_lba(state, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); - kfree(legacymbr); - } - if (!good_pmbr) - goto fail; - } + /* This will be added to the EFI Spec. per Intel after v1.02. */ + legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); + if (!legacymbr) + goto fail; + + read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); + good_pmbr = is_pmbr_valid(legacymbr); + kfree(legacymbr); + + if (!good_pmbr) + goto fail; + + pr_debug("Device has a %s MBR\n", + good_pmbr == GPT_MBR_PROTECTIVE ? + "protective" : "hybrid"); + } good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); diff --git a/block/partitions/efi.h b/block/partitions/efi.h index e645ecb35bf3..7fef625c04de 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -37,6 +37,9 @@ #define EFI_PMBR_OSTYPE_EFI 0xEF #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE +#define GPT_MBR_PROTECTIVE 1 +#define GPT_MBR_HYBRID 2 + #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL #define GPT_HEADER_REVISION_V1 0x00010000 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 -- cgit v1.2.3 From 27a7c642174eaec627f6a3a254035bf8abd02c5e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:00 -0700 Subject: partitions/efi: account for pmbr size in lba The partition that has the 0xEE (GPT protective), must have the size in lba field set to the lesser of the size of the disk minus one or 0xFFFFFFFF for larger disks. Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index e3cb4f19cf6d..b028af688361 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -166,6 +166,7 @@ invalid: /** * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure + * @total_sectors: amount of sectors in the device * * Description: Checks for a valid protective or hybrid * master boot record (MBR). The validity of a pMBR depends @@ -180,9 +181,9 @@ invalid: * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or * GPT_MBR_HYBRID depending on the device layout. */ -static int is_pmbr_valid(legacy_mbr *mbr) +static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) { - int i, ret = 0; /* invalid by default */ + int i, part = 0, ret = 0; /* invalid by default */ if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) goto done; @@ -190,6 +191,7 @@ static int is_pmbr_valid(legacy_mbr *mbr) for (i = 0; i < 4; i++) { ret = pmbr_part_valid(&mbr->partition_record[i]); if (ret == GPT_MBR_PROTECTIVE) { + part = i; /* * Ok, we at least know that there's a protective MBR, * now check if there are other partition types for @@ -207,6 +209,18 @@ check_hybrid: EFI_PMBR_OSTYPE_EFI_GPT) && (mbr->partition_record[i].os_type != 0x00)) ret = GPT_MBR_HYBRID; + + /* + * Protective MBRs take up the lesser of the whole disk + * or 2 TiB (32bit LBA), ignoring the rest of the disk. + * + * Hybrid MBRs do not necessarily comply with this. + */ + if (ret == GPT_MBR_PROTECTIVE) { + if (le32_to_cpu(mbr->partition_record[part].size_in_lba) != + min((uint32_t) total_sectors - 1, 0xFFFFFFFF)) + ret = 0; + } done: return ret; } @@ -568,6 +582,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; + sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; u64 lastlba; if (!ptes) @@ -581,7 +596,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, goto fail; read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); + good_pmbr = is_pmbr_valid(legacymbr, total_sectors); kfree(legacymbr); if (!good_pmbr) -- cgit v1.2.3 From aa054bc93743ecce3a27f1655d59674dabc71a54 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:01 -0700 Subject: partitions/efi: compare first and last usable LBAs When verifying GPT header integrity, make sure that first usable LBA is smaller than last usable LBA. Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index b028af688361..de9f9bfa24bc 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -410,7 +410,12 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, (unsigned long long)lastlba); goto fail; } - + if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); + goto fail; + } /* Check that sizeof_partition_entry has the correct value */ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { pr_debug("GUID Partitition Entry Size check failed.\n"); -- cgit v1.2.3 From 08009b30a71d9a7c252c4bd677dbd496af9dd1a2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:02 -0700 Subject: partitions/efi: delete annoying emacs style comments I love emacs, but these settings for coding style are annoying when trying to open the efi.h file. More important, we already have checkpatch for that. Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'block') diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 7fef625c04de..4efcafba7e64 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -130,22 +130,3 @@ typedef struct _legacy_mbr { extern int efi_partition(struct parsed_partitions *state); #endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * -------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 4 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -4 - * c-argdecl-indent: 4 - * c-label-offset: -4 - * c-continued-statement-offset: 4 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ -- cgit v1.2.3 From 70f637e90ea96187530365eb1ddff8d483ba460e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:03 -0700 Subject: partitions/efi: some style cleanups Trivial coding style cleanups - still plenty left. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index de9f9bfa24bc..0df535fac0aa 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -25,6 +25,9 @@ * TODO: * * Changelog: + * Mon August 5th, 2013 Davidlohr Bueso + * - detect hybrid MBRs, tighter pMBR checking & cleanups. + * * Mon Nov 09 2004 Matt Domsch * - test for valid PMBR and valid PGPT before ever reading * AGPT, allow override with 'gpt' kernel command line option. @@ -289,8 +292,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, return NULL; if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), - (u8 *) pte, - count) < count) { + (u8 *) pte, count) < count) { kfree(pte); pte=NULL; return NULL; @@ -633,11 +635,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = pptes; kfree(agpt); kfree(aptes); - if (!good_agpt) { - printk(KERN_WARNING - "Alternate GPT is invalid, " - "using primary GPT.\n"); - } + if (!good_agpt) + printk(KERN_WARNING "Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { @@ -645,8 +644,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = aptes; kfree(pgpt); kfree(pptes); - printk(KERN_WARNING - "Primary GPT is invalid, using alternate GPT.\n"); + printk(KERN_WARNING "Primary GPT is invalid, using alternate GPT.\n"); return 1; } @@ -708,8 +706,7 @@ int efi_partition(struct parsed_partitions *state) put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ - if (!efi_guidcmp(ptes[i].partition_type_guid, - PARTITION_LINUX_RAID_GUID)) + if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; -- cgit v1.2.3 From b4bc4a18a226f46fec4ef47f2df28ea209db8b5d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 11 Sep 2013 14:25:04 -0700 Subject: block/partitions/efi.c: consistently use pr_foo() Cc: Davidlohr Bueso Cc: Karel Zak Cc: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 0df535fac0aa..1a5ec9a03c00 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -482,44 +482,42 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) if (!pgpt || !agpt) return; if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { - printk(KERN_WARNING - "GPT:Primary header LBA != Alt. header alternate_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->my_lba), (unsigned long long)le64_to_cpu(agpt->alternate_lba)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { - printk(KERN_WARNING - "GPT:Primary header alternate_lba != Alt. header my_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)le64_to_cpu(agpt->my_lba)); error_found++; } if (le64_to_cpu(pgpt->first_usable_lba) != le64_to_cpu(agpt->first_usable_lba)) { - printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:first_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); error_found++; } if (le64_to_cpu(pgpt->last_usable_lba) != le64_to_cpu(agpt->last_usable_lba)) { - printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:last_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); error_found++; } if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { - printk(KERN_WARNING "GPT:disk_guids don't match.\n"); + pr_warn("GPT:disk_guids don't match.\n"); error_found++; } if (le32_to_cpu(pgpt->num_partition_entries) != le32_to_cpu(agpt->num_partition_entries)) { - printk(KERN_WARNING "GPT:num_partition_entries don't match: " + pr_warn("GPT:num_partition_entries don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->num_partition_entries), le32_to_cpu(agpt->num_partition_entries)); @@ -527,8 +525,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if (le32_to_cpu(pgpt->sizeof_partition_entry) != le32_to_cpu(agpt->sizeof_partition_entry)) { - printk(KERN_WARNING - "GPT:sizeof_partition_entry values don't match: " + pr_warn("GPT:sizeof_partition_entry values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->sizeof_partition_entry), le32_to_cpu(agpt->sizeof_partition_entry)); @@ -536,34 +533,30 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if (le32_to_cpu(pgpt->partition_entry_array_crc32) != le32_to_cpu(agpt->partition_entry_array_crc32)) { - printk(KERN_WARNING - "GPT:partition_entry_array_crc32 values don't match: " + pr_warn("GPT:partition_entry_array_crc32 values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->partition_entry_array_crc32), le32_to_cpu(agpt->partition_entry_array_crc32)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)lastlba); error_found++; } if (le64_to_cpu(agpt->my_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Alternate GPT header not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(agpt->my_lba), (unsigned long long)lastlba); error_found++; } if (error_found) - printk(KERN_WARNING - "GPT: Use GNU Parted to correct GPT errors.\n"); + pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); return; } @@ -636,7 +629,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, kfree(agpt); kfree(aptes); if (!good_agpt) - printk(KERN_WARNING "Alternate GPT is invalid, using primary GPT.\n"); + pr_warn("Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { @@ -644,7 +637,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = aptes; kfree(pgpt); kfree(pptes); - printk(KERN_WARNING "Primary GPT is invalid, using alternate GPT.\n"); + pr_warn("Primary GPT is invalid, using alternate GPT.\n"); return 1; } -- cgit v1.2.3 From 5e4c0d974139a98741b829b27cf38dc8f9284490 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Sep 2013 14:26:05 -0700 Subject: lib/radix-tree.c: make radix_tree_node_alloc() work correctly within interrupt With users of radix_tree_preload() run from interrupt (block/blk-ioc.c is one such possible user), the following race can happen: radix_tree_preload() ... radix_tree_insert() radix_tree_node_alloc() if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; ... radix_tree_preload() ... radix_tree_insert() radix_tree_node_alloc() if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; And we give out one radix tree node twice. That clearly results in radix tree corruption with different results (usually OOPS) depending on which two users of radix tree race. We fix the problem by making radix_tree_node_alloc() always allocate fresh radix tree nodes when in interrupt. Using preloading when in interrupt doesn't make sense since all the allocations have to be atomic anyway and we cannot steal nodes from process-context users because some users rely on radix_tree_insert() succeeding after radix_tree_preload(). in_interrupt() check is somewhat ugly but we cannot simply key off passed gfp_mask as that is acquired from root_gfp_mask() and thus the same for all preload users. Another part of the fix is to avoid node preallocation in radix_tree_preload() when passed gfp_mask doesn't allow waiting. Again, preallocation in such case doesn't make sense and when preallocation would happen in interrupt we could possibly leak some allocated nodes. However, some users of radix_tree_preload() require following radix_tree_insert() to succeed. To avoid unexpected effects for these users, radix_tree_preload() only warns if passed gfp mask doesn't allow waiting and we provide a new function radix_tree_maybe_preload() for those users which get different gfp mask from different call sites and which are prepared to handle radix_tree_insert() failure. Signed-off-by: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-ioc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 4464c823cff2..46cd7bd18b34 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -367,7 +367,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, if (!icq) return NULL; - if (radix_tree_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(gfp_mask) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } -- cgit v1.2.3 From 6b02fa59a7cf34c548eedee657b07ea6c54d3894 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 13 Sep 2013 15:02:22 -0700 Subject: partitions/efi: loosen check fot pmbr size in lba Matt found that commit 27a7c642174e ("partitions/efi: account for pmbr size in lba") caused his GPT formatted eMMC device not to boot. The reason is that this commit enforced Linux to always check the lesser of the whole disk or 2Tib for the pMBR size in LBA. While most disk partitioning tools out there create a pMBR with these characteristics, Microsoft does not, as it always sets the entry to the maximum 32-bit limitation - even though a drive may be smaller than that[1]. Loosen this check and only verify that the size is either the whole disk or 0xFFFFFFFF. No tool in its right mind would set it to any value other than these. [1] http://thestarman.pcministry.com/asm/mbr/GPT.htm#GPTPT Reported-and-tested-by: Matt Porter Signed-off-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 1a5ec9a03c00..1eb09ee5311b 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -186,6 +186,7 @@ invalid: */ static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) { + uint32_t sz = 0; int i, part = 0, ret = 0; /* invalid by default */ if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) @@ -216,12 +217,15 @@ check_hybrid: /* * Protective MBRs take up the lesser of the whole disk * or 2 TiB (32bit LBA), ignoring the rest of the disk. + * Some partitioning programs, nonetheless, choose to set + * the size to the maximum 32-bit limitation, disregarding + * the disk size. * * Hybrid MBRs do not necessarily comply with this. */ if (ret == GPT_MBR_PROTECTIVE) { - if (le32_to_cpu(mbr->partition_record[part].size_in_lba) != - min((uint32_t) total_sectors - 1, 0xFFFFFFFF)) + sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); + if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) ret = 0; } done: -- cgit v1.2.3 From 7652113c2f508b1c8176640dcd034730fe79bc48 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 18 Sep 2013 08:33:55 -0600 Subject: If the queue is dying then we only call the rq->end_io callout. This leaves bios setup on the request, because the caller assumes when the blk_execute_rq_nowait/blk_execute_rq call has completed that the rq->bios have been cleaned up. This patch has blk_execute_rq_nowait use __blk_end_request_all to free bios and also call rq->end_io. Signed-off-by: Mike Christie Signed-off-by: Jens Axboe --- block/blk-exec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-exec.c b/block/blk-exec.c index e70621396129..ae4f27d7944e 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -68,9 +68,9 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { + rq->cmd_flags |= REQ_QUIET; rq->errors = -ENXIO; - if (rq->end_io) - rq->end_io(rq, rq->errors); + __blk_end_request_all(rq, rq->errors); spin_unlock_irq(q->queue_lock); return; } -- cgit v1.2.3 From f3cff25f05f2ac29b2ee355e611b0657482f6f1d Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Sun, 22 Sep 2013 12:43:47 -0600 Subject: cfq: explicitly use 64bit divide operation for 64bit arguments 'samples' is 64bit operant, but do_div() second parameter is 32. do_div silently truncates high 32 bits and calculated result is invalid. In case if low 32bit of 'samples' are zeros then do_div() produces kernel crash. Signed-off-by: Anatol Pomozov Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f0468e252ee4..51e06ea06a2e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1803,7 +1803,7 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, if (samples) { v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); - do_div(v, samples); + v = div64_u64(v, samples); } __blkg_prfill_u64(sf, pd, v); return 0; -- cgit v1.2.3 From 080506ad0aa9c9fbc7879cdd290d55624da08c60 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 30 Sep 2013 13:45:19 -0700 Subject: block: change config option name for cmdline partition parsing Recently commit bab55417b10c ("block: support embedded device command line partition") introduced CONFIG_CMDLINE_PARSER. However, that name is too generic and sounds like it enables/disables generic kernel boot arg processing, when it really is block specific. Before this option becomes a part of a full/final release, add the BLK_ prefix to it so that it is clear in absence of any other context that it is block specific. In addition, fix up the following less critical items: - help text was not really at all helpful. - index file for Documentation was not updated - add the new arg to Documentation/kernel-parameters.txt - clarify wording in source comments Signed-off-by: Paul Gortmaker Cc: Jens Axboe Cc: Cai Zhiyong Cc: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/Kconfig | 9 +++++++-- block/Makefile | 2 +- block/partitions/Kconfig | 4 ++-- block/partitions/cmdline.c | 8 ++++---- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 7f38e40fee08..2429515c05c2 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -99,11 +99,16 @@ config BLK_DEV_THROTTLING See Documentation/cgroups/blkio-controller.txt for more information. -config CMDLINE_PARSER +config BLK_CMDLINE_PARSER bool "Block device command line partition parser" default n ---help--- - Parsing command line, get the partitions information. + Enabling this option allows you to specify the partition layout from + the kernel boot args. This is typically of use for embedded devices + which don't otherwise have any standardized method for listing the + partitions on a block device. + + See Documentation/block/cmdline-partition.txt for more information. menu "Partition Types" diff --git a/block/Makefile b/block/Makefile index 4fa4be544ece..671a83d063a5 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,4 +18,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o -obj-$(CONFIG_CMDLINE_PARSER) += cmdline-parser.o +obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 87a32086535d..9b29a996c311 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -263,7 +263,7 @@ config SYSV68_PARTITION config CMDLINE_PARTITION bool "Command line partition support" if PARTITION_ADVANCED - select CMDLINE_PARSER + select BLK_CMDLINE_PARSER help - Say Y here if you would read the partitions table from bootargs. + Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 56cf4ffad51e..5141b563adf1 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -2,15 +2,15 @@ * Copyright (C) 2013 HUAWEI * Author: Cai Zhiyong * - * Read block device partition table from command line. - * The partition used for fixed block device (eMMC) embedded device. - * It is no MBR, save storage space. Bootloader can be easily accessed + * Read block device partition table from the command line. + * Typically used for fixed block (eMMC) embedded devices. + * It has no MBR, so saves storage space. Bootloader can be easily accessed * by absolute address of data on the block device. * Users can easily change the partition. * * The format for the command line is just like mtdparts. * - * Verbose config please reference "Documentation/block/cmdline-partition.txt" + * For further information, see "Documentation/block/cmdline-partition.txt" * */ -- cgit v1.2.3