diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 2 | ||||
-rw-r--r-- | block/blk-cgroup.c | 4 | ||||
-rw-r--r-- | block/blk-core.c | 40 | ||||
-rw-r--r-- | block/blk-ioc.c | 5 | ||||
-rw-r--r-- | block/blk-merge.c | 3 | ||||
-rw-r--r-- | block/blk-throttle.c | 10 | ||||
-rw-r--r-- | block/cfq-iosched.c | 140 | ||||
-rw-r--r-- | block/genhd.c | 550 | ||||
-rw-r--r-- | block/ioctl.c | 5 |
9 files changed, 650 insertions, 109 deletions
diff --git a/block/Kconfig b/block/Kconfig index 6c9213ef15a1..60be1e0455da 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -2,7 +2,7 @@ # Block layer core configuration # menuconfig BLOCK - bool "Enable the block layer" if EMBEDDED + bool "Enable the block layer" if EXPERT default y help Provide block layer support for the kernel. diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b1febd0f6d2a..455768a3eb9e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1452,10 +1452,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) goto done; } - /* Currently we do not support hierarchy deeper than two level (0,1) */ - if (parent != cgroup->top_cgroup) - return ERR_PTR(-EPERM); - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) return ERR_PTR(-ENOMEM); diff --git a/block/blk-core.c b/block/blk-core.c index ab4a7696956d..3cc17e6064d6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,7 +33,7 @@ #include "blk.h" -EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!hd_struct_try_get(part)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. + */ + part = &rq->rq_disk->part0; + hd_struct_get(part); + } part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -1342,9 +1357,9 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; - trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, - bio->bi_sector - p->start_sect); + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, + bdev->bd_dev, + bio->bi_sector - p->start_sect); } } @@ -1513,7 +1528,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (old_sector != -1) - trace_block_remap(q, bio, old_dev, old_sector); + trace_block_bio_remap(q, bio, old_dev, old_sector); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; @@ -1789,7 +1804,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1809,13 +1824,14 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); part_dec_in_flight(part, rw); + hd_struct_put(part); part_stat_unlock(); } } @@ -2619,7 +2635,9 @@ int __init blk_dev_init(void) BUILD_BUG_ON(__REQ_NR_BITS > 8 * sizeof(((struct request *)0)->cmd_flags)); - kblockd_workqueue = create_workqueue("kblockd"); + /* used for unplugging and affects IO latency/throughput - HIGHPRI */ + kblockd_workqueue = alloc_workqueue("kblockd", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 3c7a339fe381..b791022beef3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -64,7 +64,7 @@ static void cfq_exit(struct io_context *ioc) rcu_read_unlock(); } -/* Called by the exitting task */ +/* Called by the exiting task */ void exit_io_context(struct task_struct *task) { struct io_context *ioc; @@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - if (atomic_dec_and_test(&ioc->nr_tasks)) { + if (atomic_dec_and_test(&ioc->nr_tasks)) cfq_exit(ioc); - } put_io_context(ioc); } diff --git a/block/blk-merge.c b/block/blk-merge.c index 74bc4a768f32..ea85e20d5e94 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); + hd_struct_put(part); part_stat_unlock(); } } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 381b09bb562b..a89043a3caa4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -168,7 +168,15 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, * tree of blkg (instead of traversing through hash list all * the time. */ - tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + /* + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case + */ + if (blkcg == &blkio_root_cgroup) + tg = &td->root_tg; + else + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); /* Fill in device details for root group */ if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 968455c57e1a..f27ff3efe6cd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -87,7 +87,6 @@ struct cfq_rb_root { unsigned count; unsigned total_weight; u64 min_vdisktime; - struct rb_node *active; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ .count = 0, .min_vdisktime = 0, } @@ -97,7 +96,7 @@ struct cfq_rb_root { */ struct cfq_queue { /* reference count */ - atomic_t ref; + int ref; /* various state flags, see below */ unsigned int flags; /* parent cfq_data */ @@ -180,7 +179,6 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; unsigned int weight; - bool on_st; /* number of cfqq currently on this group */ int nr_cfqq; @@ -209,7 +207,7 @@ struct cfq_group { struct blkio_group blkg; #ifdef CONFIG_CFQ_GROUP_IOSCHED struct hlist_node cfqd_node; - atomic_t ref; + int ref; #endif /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -563,11 +561,6 @@ static void update_min_vdisktime(struct cfq_rb_root *st) u64 vdisktime = st->min_vdisktime; struct cfq_group *cfqg; - if (st->active) { - cfqg = rb_entry_cfqg(st->active); - vdisktime = cfqg->vdisktime; - } - if (st->left) { cfqg = rb_entry_cfqg(st->left); vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); @@ -605,8 +598,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) return cfq_target_latency * cfqg->weight / st->total_weight; } -static inline void -cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static inline unsigned +cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned slice = cfq_prio_to_slice(cfqd, cfqq); if (cfqd->cfq_latency) { @@ -632,6 +625,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) low_slice); } } + return slice; +} + +static inline void +cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq); + cfqq->slice_start = jiffies; cfqq->slice_end = jiffies + slice; cfqq->allocated_slice = slice; @@ -646,11 +647,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) static inline bool cfq_slice_used(struct cfq_queue *cfqq) { if (cfq_cfqq_slice_new(cfqq)) - return 0; + return false; if (time_before(jiffies, cfqq->slice_end)) - return 0; + return false; - return 1; + return true; } /* @@ -869,7 +870,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) struct rb_node *n; cfqg->nr_cfqq++; - if (cfqg->on_st) + if (!RB_EMPTY_NODE(&cfqg->rb_node)) return; /* @@ -885,7 +886,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) cfqg->vdisktime = st->min_vdisktime; __cfq_group_service_tree_add(st, cfqg); - cfqg->on_st = true; st->total_weight += cfqg->weight; } @@ -894,9 +894,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - if (st->active == &cfqg->rb_node) - st->active = NULL; - BUG_ON(cfqg->nr_cfqq < 1); cfqg->nr_cfqq--; @@ -905,7 +902,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) return; cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); - cfqg->on_st = false; st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); @@ -1026,11 +1022,11 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) * elevator which will be dropped by either elevator exit * or cgroup deletion path depending on who is exiting first. */ - atomic_set(&cfqg->ref, 1); + cfqg->ref = 1; /* * Add group onto cgroup list. It might happen that bdi->dev is - * not initiliazed yet. Initialize this new group without major + * not initialized yet. Initialize this new group without major * and minor info and this info will be filled in once a new thread * comes for IO. See code above. */ @@ -1071,7 +1067,7 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) { - atomic_inc(&cfqg->ref); + cfqg->ref++; return cfqg; } @@ -1083,7 +1079,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ - atomic_inc(&cfqq->cfqg->ref); + cfqq->cfqg->ref++; } static void cfq_put_cfqg(struct cfq_group *cfqg) @@ -1091,11 +1087,12 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) struct cfq_rb_root *st; int i, j; - BUG_ON(atomic_read(&cfqg->ref) <= 0); - if (!atomic_dec_and_test(&cfqg->ref)) + BUG_ON(cfqg->ref <= 0); + cfqg->ref--; + if (cfqg->ref) return; for_each_cfqg_st(cfqg, i, j, st) - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); + BUG_ON(!RB_EMPTY_ROOT(&st->rb)); kfree(cfqg); } @@ -1200,7 +1197,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_group_service_tree_del(cfqd, cfqq->cfqg); cfqq->orig_cfqg = cfqq->cfqg; cfqq->cfqg = &cfqd->root_group; - atomic_inc(&cfqd->root_group.ref); + cfqd->root_group.ref++; group_changed = 1; } else if (!cfqd->cfq_group_isolation && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { @@ -1672,8 +1669,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * store what was left of this slice, if the queue idled/timed out */ - if (timed_out && !cfq_cfqq_slice_new(cfqq)) { - cfqq->slice_resid = cfqq->slice_end - jiffies; + if (timed_out) { + if (cfq_cfqq_slice_new(cfqq)) + cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); + else + cfqq->slice_resid = cfqq->slice_end - jiffies; cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } @@ -1687,9 +1687,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) - cfqd->grp_service_tree.active = NULL; - if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -1901,10 +1898,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * in their service tree. */ if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) - return 1; + return true; cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", service_tree->count); - return 0; + return false; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) @@ -2040,7 +2037,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq) int process_refs, io_refs; io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; - process_refs = atomic_read(&cfqq->ref) - io_refs; + process_refs = cfqq->ref - io_refs; BUG_ON(process_refs < 0); return process_refs; } @@ -2080,10 +2077,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) */ if (new_process_refs >= process_refs) { cfqq->new_cfqq = new_cfqq; - atomic_add(process_refs, &new_cfqq->ref); + new_cfqq->ref += process_refs; } else { new_cfqq->new_cfqq = cfqq; - atomic_add(new_process_refs, &cfqq->ref); + cfqq->ref += new_process_refs; } } @@ -2116,12 +2113,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) unsigned count; struct cfq_rb_root *st; unsigned group_slice; - - if (!cfqg) { - cfqd->serving_prio = IDLE_WORKLOAD; - cfqd->workload_expires = jiffies + 1; - return; - } + enum wl_prio_t original_prio = cfqd->serving_prio; /* Choose next priority. RT > BE > IDLE */ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) @@ -2134,6 +2126,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) return; } + if (original_prio != cfqd->serving_prio) + goto new_workload; + /* * For RT and BE, we have to choose also the type * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload @@ -2148,6 +2143,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) if (count && !time_after(jiffies, cfqd->workload_expires)) return; +new_workload: /* otherwise select new workload type */ cfqd->serving_type = cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); @@ -2199,7 +2195,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) if (RB_EMPTY_ROOT(&st->rb)) return NULL; cfqg = cfq_rb_first_group(st); - st->active = &cfqg->rb_node; update_min_vdisktime(st); return cfqg; } @@ -2293,6 +2288,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) goto keep_queue; } + /* + * This is a deep seek queue, but the device is much faster than + * the queue can deliver, don't idle + **/ + if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && + (cfq_cfqq_slice_new(cfqq) || + (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) { + cfq_clear_cfqq_deep(cfqq); + cfq_clear_cfqq_idle_window(cfqq); + } + if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { cfqq = NULL; goto keep_queue; @@ -2367,12 +2373,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, { /* the queue hasn't finished any request, can't estimate */ if (cfq_cfqq_slice_new(cfqq)) - return 1; + return true; if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, cfqq->slice_end)) - return 1; + return true; - return 0; + return false; } static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) @@ -2538,9 +2544,10 @@ static void cfq_put_queue(struct cfq_queue *cfqq) struct cfq_data *cfqd = cfqq->cfqd; struct cfq_group *cfqg, *orig_cfqg; - BUG_ON(atomic_read(&cfqq->ref) <= 0); + BUG_ON(cfqq->ref <= 0); - if (!atomic_dec_and_test(&cfqq->ref)) + cfqq->ref--; + if (cfqq->ref) return; cfq_log_cfqq(cfqd, cfqq, "put_queue"); @@ -2843,7 +2850,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, RB_CLEAR_NODE(&cfqq->p_node); INIT_LIST_HEAD(&cfqq->fifo); - atomic_set(&cfqq->ref, 0); + cfqq->ref = 0; cfqq->cfqd = cfqd; cfq_mark_cfqq_prio_changed(cfqq); @@ -2979,11 +2986,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, * pin the queue now that it's allocated, scheduler exit will prune it */ if (!is_sync && !(*async_cfqq)) { - atomic_inc(&cfqq->ref); + cfqq->ref++; *async_cfqq = cfqq; } - atomic_inc(&cfqq->ref); + cfqq->ref++; return cfqq; } @@ -3265,6 +3272,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) return true; + /* An idle queue should not be idle now for some reason */ + if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) + return true; + if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) return false; @@ -3284,10 +3295,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_queue *old_cfqq = cfqd->active_queue; + cfq_log_cfqq(cfqd, cfqq, "preempt"); cfq_slice_expired(cfqd, 1); /* + * workload type is changed, don't save slice, otherwise preempt + * doesn't happen + */ + if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) + cfqq->cfqg->saved_workload_slice = 0; + + /* * Put the new queue at the front of the of the current list, * so we know that it will be selected next. */ @@ -3412,6 +3432,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) { struct cfq_io_context *cic = cfqd->active_cic; + /* If the queue already has requests, don't wait */ + if (!RB_EMPTY_ROOT(&cfqq->sort_list)) + return false; + /* If there are other queues in the group, don't wait */ if (cfqq->cfqg->nr_cfqq > 1) return false; @@ -3681,10 +3705,10 @@ new_queue: } cfqq->allocated[rw]++; - atomic_inc(&cfqq->ref); spin_unlock_irqrestore(q->queue_lock, flags); + cfqq->ref++; rq->elevator_private[0] = cic; rq->elevator_private[1] = cfqq; rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); @@ -3862,6 +3886,10 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; + /* + * Don't need take queue_lock in the routine, since we are + * initializing the ioscheduler, and nobody is using cfqd + */ cfqd->cic_index = i; /* Init root service tree */ @@ -3881,7 +3909,7 @@ static void *cfq_init_queue(struct request_queue *q) * Take a reference to root group which we never drop. This is just * to make sure that cfq_put_cfqg() does not try to kfree root group */ - atomic_set(&cfqg->ref, 1); + cfqg->ref = 1; rcu_read_lock(); cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 0); @@ -3901,7 +3929,7 @@ static void *cfq_init_queue(struct request_queue *q) * will not attempt to free it. */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); - atomic_inc(&cfqd->oom_cfqq.ref); + cfqd->oom_cfqq.ref++; cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); INIT_LIST_HEAD(&cfqd->cic_list); diff --git a/block/genhd.c b/block/genhd.c index 5fa2b44a72ff..6a5b772aa201 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -18,6 +18,7 @@ #include <linux/buffer_head.h> #include <linux/mutex.h> #include <linux/idr.h> +#include <linux/log2.h> #include "blk.h" @@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_add_events(struct gendisk *disk); +static void disk_del_events(struct gendisk *disk); +static void disk_release_events(struct gendisk *disk); + /** * disk_get_part - get partition * @disk: disk to look partition from @@ -239,7 +244,7 @@ static struct blk_major_name { } *major_names[BLKDEV_MAJOR_HASH_SIZE]; /* index in the above - for now: assume no multimajor ranges */ -static inline int major_to_index(int major) +static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } @@ -502,6 +507,64 @@ static int exact_lock(dev_t devt, void *data) return 0; } +void register_disk(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct block_device *bdev; + struct disk_part_iter piter; + struct hd_struct *part; + int err; + + ddev->parent = disk->driverfs_dev; + + dev_set_name(ddev, disk->disk_name); + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (err) { + device_del(ddev); + return; + } + } + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + /* No minors to use for partitions */ + if (!disk_partitionable(disk)) + goto exit; + + /* No such device (e.g., media were just removed) */ + if (!get_capacity(disk)) + goto exit; + + bdev = bdget_disk(disk, 0); + if (!bdev) + goto exit; + + bdev->bd_invalidated = 1; + err = blkdev_get(bdev, FMODE_READ, NULL); + if (err < 0) + goto exit; + blkdev_put(bdev, FMODE_READ); + +exit: + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -551,18 +614,48 @@ void add_disk(struct gendisk *disk) retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); -} + disk_add_events(disk); +} EXPORT_SYMBOL(add_disk); -EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ -void unlink_gendisk(struct gendisk *disk) +void del_gendisk(struct gendisk *disk) { + struct disk_part_iter piter; + struct hd_struct *part; + + disk_del_events(disk); + + /* invalidate stuff */ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(disk, part->partno); + } + disk_part_iter_exit(&piter); + + invalidate_partition(disk, 0); + blk_free_devt(disk_to_dev(disk)->devt); + set_capacity(disk, 0); + disk->flags &= ~GENHD_FL_UP; + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); + + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + disk->driverfs_dev = NULL; + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + device_del(disk_to_dev(disk)); } +EXPORT_SYMBOL(del_gendisk); /** * get_gendisk - get partitioning information for a given device @@ -735,7 +828,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static void *p; p = disk_seqf_start(seqf, pos); - if (!IS_ERR(p) && p && !*pos) + if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } @@ -1005,6 +1098,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); @@ -1110,29 +1204,6 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ -static void media_change_notify_thread(struct work_struct *work) -{ - struct gendisk *gd = container_of(work, struct gendisk, async_notify); - char event[] = "MEDIA_CHANGE=1"; - char *envp[] = { event, NULL }; - - /* - * set enviroment vars to indicate which event this is for - * so that user space will know to go check the media status. - */ - kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); - put_device(gd->driverfs_dev); -} - -#if 0 -void genhd_media_change_notify(struct gendisk *disk) -{ - get_device(disk->driverfs_dev); - schedule_work(&disk->async_notify); -} -EXPORT_SYMBOL_GPL(genhd_media_change_notify); -#endif /* 0 */ - dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); @@ -1193,13 +1264,13 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } disk->part_tbl->part[0] = &disk->part0; + hd_ref_init(&disk->part0); + disk->minors = minors; rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); - INIT_WORK(&disk->async_notify, - media_change_notify_thread); } return disk; } @@ -1291,3 +1362,422 @@ int invalidate_partition(struct gendisk *disk, int partno) } EXPORT_SYMBOL(invalidate_partition); + +/* + * Disk events - monitor disk events like media change and eject request. + */ +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs = 0; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll iff there are events which + * can't be monitored asynchronously. + */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->events & ~disk->async_events) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +static void __disk_block_events(struct gendisk *disk, bool sync) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) { + if (sync) + cancel_delayed_work_sync(&disk->ev->dwork); + else + cancel_delayed_work(&disk->ev->dwork); + } +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + /* + * Not exactly a latency critical operation, set poll timer + * slack to 25% and kick event check. + */ + intv = disk_events_poll_jiffies(disk); + set_timer_slack(&ev->dwork.timer, intv / 4); + if (check_now) + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_block_events(disk, true); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, true); +} + +/** + * disk_check_events - schedule immediate event checking + * @disk: disk to check events for + * + * Schedule immediate event checking on @disk if not blocked. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_check_events(struct gendisk *disk) +{ + if (disk->ev) { + __disk_block_events(disk, false); + __disk_unblock_events(disk, true); + } +} +EXPORT_SYMBOL_GPL(disk_check_events); + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and clearted + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. + */ +unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + const struct block_device_operations *bdops = disk->fops; + struct disk_events *ev = disk->ev; + unsigned int pending; + + if (!ev) { + /* for drivers still using the old ->media_changed method */ + if ((mask & DISK_EVENT_MEDIA_CHANGE) && + bdops->media_changed && bdops->media_changed(disk)) + return DISK_EVENT_MEDIA_CHANGE; + return 0; + } + + /* tell the workfn about the events being cleared */ + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + spin_unlock_irq(&ev->lock); + + /* uncondtionally schedule event check and wait for it to finish */ + __disk_block_events(disk, true); + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + flush_delayed_work(&ev->dwork); + __disk_unblock_events(disk, false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + + return pending; +} + +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = ev->clearing; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + ev->clearing &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* tell userland about new events */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. + * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->async_events, buf); +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + __disk_block_events(disk, true); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + + return count; +} + +static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR, + disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +static const struct attribute *disk_events_attrs[] = { + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + NULL, +}; + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + + list_for_each_entry(ev, &disk_events, node) + disk_check_events(ev->disk); + + mutex_unlock(&disk_events_mutex); + + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{add|del|release}_events - initialize and destroy disk_events. + */ +static void disk_add_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events || !(disk->events | disk->async_events)) + return; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return; + } + + if (sysfs_create_files(&disk_to_dev(disk)->kobj, + disk_events_attrs) < 0) { + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + kfree(ev); + return; + } + + disk->ev = ev; + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + mutex_lock(&disk_events_mutex); + list_add_tail(&ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +static void disk_del_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + __disk_block_events(disk, true); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); +} + +static void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} diff --git a/block/ioctl.c b/block/ioctl.c index a9a302eba01e..9049d460fa89 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EINVAL; if (get_user(n, (int __user *) arg)) return -EFAULT; - if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) + if (!(mode & FMODE_EXCL) && + blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) return -EBUSY; ret = set_blocksize(bdev, n); if (!(mode & FMODE_EXCL)) - bd_release(bdev); + blkdev_put(bdev, mode | FMODE_EXCL); return ret; case BLKPG: ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); |