From df0793abb929e66606fa25f3875ff1b89de5ad32 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 19 Jan 2012 09:20:09 +0100 Subject: block,cfq: change code order cfq_slice_expired will change saved_workload_slice. It should be called first so saved_workload_slice is correctly set to 0 after workload type is changed. This fixes the code order changed by 54b466e44b1c7. Tested-by: Tetsuo Handa Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ee55019066a1..da21c24dbed3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3117,17 +3117,18 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + enum wl_type_t old_type = cfqq_type(cfqd->active_queue); + cfq_log_cfqq(cfqd, cfqq, "preempt"); + cfq_slice_expired(cfqd, 1); /* * workload type is changed, don't save slice, otherwise preempt * doesn't happen */ - if (cfqq_type(cfqd->active_queue) != cfqq_type(cfqq)) + if (old_type != cfqq_type(cfqq)) cfqq->cfqg->saved_workload_slice = 0; - cfq_slice_expired(cfqd, 1); - /* * Put the new queue at the front of the of the current list, * so we know that it will be selected next. 
-- cgit v1.2.3 From 05c30b9551f1904d9950ad0d28e65fc4ff3c8a8e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 19 Jan 2012 09:20:10 +0100 Subject: block: fix NULL icq_cache reference Vivek reported a kernel crash: [ 94.217015] BUG: unable to handle kernel NULL pointer dereference at 000000000000001c [ 94.218004] IP: [] kmem_cache_free+0x5e/0x200 [ 94.218004] PGD 13abda067 PUD 137d52067 PMD 0 [ 94.218004] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 94.218004] CPU 0 [ 94.218004] Modules linked in: [last unloaded: scsi_wait_scan] [ 94.218004] [ 94.218004] Pid: 0, comm: swapper/0 Not tainted 3.2.0+ #16 Hewlett-Packard HP xw6600 Workstation/0A9Ch [ 94.218004] RIP: 0010:[] [] kmem_cache_free+0x5e/0x200 [ 94.218004] RSP: 0018:ffff88013fc03de0 EFLAGS: 00010006 [ 94.218004] RAX: ffffffff81e0d020 RBX: ffff880138b3c680 RCX: 00000001801c001b [ 94.218004] RDX: 00000000003aac1d RSI: ffff880138b3c680 RDI: ffffffff81142fae [ 94.218004] RBP: ffff88013fc03e10 R08: ffff880137830238 R09: 0000000000000001 [ 94.218004] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 94.218004] R13: ffffea0004e2cf00 R14: ffffffff812f6eb6 R15: 0000000000000246 [ 94.218004] FS: 0000000000000000(0000) GS:ffff88013fc00000(0000) knlGS:0000000000000000 [ 94.218004] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 94.218004] CR2: 000000000000001c CR3: 00000001395ab000 CR4: 00000000000006f0 [ 94.218004] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 94.218004] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 94.218004] Process swapper/0 (pid: 0, threadinfo ffffffff81e00000, task ffffffff81e0d020) [ 94.218004] Stack: [ 94.218004] 0000000000000102 ffff88013fc0db20 ffffffff81e22700 ffff880139500f00 [ 94.218004] 0000000000000001 000000000000000a ffff88013fc03e20 ffffffff812f6eb6 [ 94.218004] ffff88013fc03e90 ffffffff810c8da2 ffffffff81e01fd8 ffff880137830240 [ 94.218004] Call Trace: [ 94.218004] [ 94.218004] [] icq_free_icq_rcu+0x16/0x20 [ 94.218004] [] 
__rcu_process_callbacks+0x1c2/0x420 [ 94.218004] [] rcu_process_callbacks+0x38/0x250 [ 94.218004] [] __do_softirq+0xce/0x3e0 [ 94.218004] [] ? clockevents_program_event+0x74/0x100 [ 94.218004] [] ? tick_program_event+0x24/0x30 [ 94.218004] [] call_softirq+0x1c/0x30 [ 94.218004] [] do_softirq+0x8d/0xc0 [ 94.218004] [] irq_exit+0xae/0xe0 [ 94.218004] [] smp_apic_timer_interrupt+0x6e/0x99 [ 94.218004] [] apic_timer_interrupt+0x70/0x80 Once a queue is quiesced, it's not supposed to have any elvpriv data or icq's, and elevator switching depends on that. Request alloc path followed the rule for elvpriv data but forgot to apply it to icq's, leading to the following crash during elevator switch. Fix it by not allocating icq's if ELVPRIV is not set for the request. Reported-by: Vivek Goyal Tested-by: Vivek Goyal Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e6c05a97ee2b..636702575118 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -872,13 +872,15 @@ retry: spin_unlock_irq(q->queue_lock); /* create icq if missing */ - if (unlikely(et->icq_cache && !icq)) + if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) { icq = ioc_create_icq(q, gfp_mask); + if (!icq) + goto fail_icq; + } - /* rqs are guaranteed to have icq on elv_set_request() if requested */ - if (likely(!et->icq_cache || icq)) - rq = blk_alloc_request(q, icq, rw_flags, gfp_mask); + rq = blk_alloc_request(q, icq, rw_flags, gfp_mask); +fail_icq: if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. 
Undo anything -- cgit v1.2.3 From 9fa73472ddbcd3da87d35a7f4566eaaf345f798e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 6 Feb 2012 08:57:29 +0100 Subject: block: fix ioc locking warning Meelis reported a warning: WARNING: at kernel/timer.c:1122 run_timer_softirq+0x199/0x1ec() Hardware name: 939Dual-SATA2 timer: cfq_idle_slice_timer+0x0/0xaa preempt leak: 00000102 -> 00000103 Modules linked in: sr_mod cdrom videodev media drm_kms_helper ohci_hcd ehci_hcd v4l2_compat_ioctl32 usbcore i2c_ali15x3 snd_seq drm snd_timer snd_seq Pid: 0, comm: swapper Not tainted 3.3.0-rc2-00110-gd125666 #176 Call Trace: [] warn_slowpath_common+0x7e/0x96 [] ? cfq_slice_expired+0x1d/0x1d [] warn_slowpath_fmt+0x41/0x43 [] ? cfq_idle_slice_timer+0xa1/0xaa [] ? cfq_slice_expired+0x1d/0x1d [] run_timer_softirq+0x199/0x1ec [] ? timekeeping_get_ns+0x12/0x31 [] ? apic_write+0x11/0x13 [] __do_softirq+0x74/0xfa [] call_softirq+0x1a/0x30 [] do_softirq+0x31/0x68 [] irq_exit+0x3d/0xa3 [] smp_apic_timer_interrupt+0x6b/0x77 [] apic_timer_interrupt+0x69/0x70 [] ? sched_clock_cpu+0x73/0x7d [] ? sched_clock_cpu+0x73/0x7d [] ? default_idle+0x1e/0x32 [] ? default_idle+0x18/0x32 [] cpu_idle+0x87/0xd1 [] rest_init+0x85/0x89 [] start_kernel+0x2eb/0x2f8 [] x86_64_start_reservations+0x7e/0x82 [] x86_64_start_kernel+0xf0/0xf7 this_q == locked_q is possible. There are two problems here: 1. In UP case, there is a preemption counter issue as spin_trylock always succeeds. 2. In SMP case, the loop breaks too early. 
Signed-off-by: Shaohua Li Reported-by: Meelis Roos Reported-by: Knut Petersen Tested-by: Knut Petersen Signed-off-by: Jens Axboe --- block/blk-ioc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 27a06e00eaec..7490b6da2453 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -204,7 +204,9 @@ void put_io_context(struct io_context *ioc, struct request_queue *locked_q) spin_unlock(last_q->queue_lock); last_q = NULL; - if (!spin_trylock(this_q->queue_lock)) + /* spin_trylock() always successes in UP case */ + if (this_q != locked_q && + !spin_trylock(this_q->queue_lock)) break; last_q = this_q; continue; -- cgit v1.2.3 From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Feb 2012 07:51:30 +0100 Subject: block: strip out locking optimization in put_io_context() put_io_context() performed a complex trylock dancing to avoid deferring ioc release to workqueue. It was also broken on UP because trylock was always assumed to succeed which resulted in unbalanced preemption count. While there are ways to fix the UP breakage, even the most pathological microbench (forced ioc allocation and tight fork/exit loop) fails to show any appreciable performance benefit of the optimization. Strip it out. If there turns out to be workloads which are affected by this change, simpler optimization from the discussion thread can be applied later. 
Signed-off-by: Tejun Heo LKML-Reference: <1328514611.21268.66.camel@sli10-conroe> Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 2 +- block/blk-ioc.c | 92 +++++++---------------------------------------------- block/cfq-iosched.c | 2 +- 4 files changed, 14 insertions(+), 84 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa8f26309444..75642a352a8f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1659,7 +1659,7 @@ static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_cgroup_changed(ioc); - put_io_context(ioc, NULL); + put_io_context(ioc); } } } diff --git a/block/blk-core.c b/block/blk-core.c index 636702575118..532b3a21b383 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -642,7 +642,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) if (rq->cmd_flags & REQ_ELVPRIV) { elv_put_request(q, rq); if (rq->elv.icq) - put_io_context(rq->elv.icq->ioc, q); + put_io_context(rq->elv.icq->ioc); } mempool_free(rq, q->rq.rq_pool); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 7490b6da2453..9884fd7427fe 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -29,21 +29,6 @@ void get_io_context(struct io_context *ioc) } EXPORT_SYMBOL(get_io_context); -/* - * Releasing ioc may nest into another put_io_context() leading to nested - * fast path release. As the ioc's can't be the same, this is okay but - * makes lockdep whine. Keep track of nesting and use it as subclass. - */ -#ifdef CONFIG_LOCKDEP -#define ioc_release_depth(q) ((q) ? 
(q)->ioc_release_depth : 0) -#define ioc_release_depth_inc(q) (q)->ioc_release_depth++ -#define ioc_release_depth_dec(q) (q)->ioc_release_depth-- -#else -#define ioc_release_depth(q) 0 -#define ioc_release_depth_inc(q) do { } while (0) -#define ioc_release_depth_dec(q) do { } while (0) -#endif - static void icq_free_icq_rcu(struct rcu_head *head) { struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); @@ -75,11 +60,8 @@ static void ioc_exit_icq(struct io_cq *icq) if (rcu_dereference_raw(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); - if (et->ops.elevator_exit_icq_fn) { - ioc_release_depth_inc(q); + if (et->ops.elevator_exit_icq_fn) et->ops.elevator_exit_icq_fn(icq); - ioc_release_depth_dec(q); - } /* * @icq->q might have gone away by the time RCU callback runs @@ -149,81 +131,29 @@ static void ioc_release_fn(struct work_struct *work) /** * put_io_context - put a reference of io_context * @ioc: io_context to put - * @locked_q: request_queue the caller is holding queue_lock of (hint) * * Decrement reference count of @ioc and release it if the count reaches - * zero. If the caller is holding queue_lock of a queue, it can indicate - * that with @locked_q. This is an optimization hint and the caller is - * allowed to pass in %NULL even when it's holding a queue_lock. + * zero. */ -void put_io_context(struct io_context *ioc, struct request_queue *locked_q) +void put_io_context(struct io_context *ioc) { - struct request_queue *last_q = locked_q; unsigned long flags; if (ioc == NULL) return; BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - if (locked_q) - lockdep_assert_held(locked_q->queue_lock); - - if (!atomic_long_dec_and_test(&ioc->refcount)) - return; /* - * Destroy @ioc. This is a bit messy because icq's are chained - * from both ioc and queue, and ioc->lock nests inside queue_lock. - * The inner ioc->lock should be held to walk our icq_list and then - * for each icq the outer matching queue_lock should be grabbed. - * ie. 
We need to do reverse-order double lock dancing. - * - * Another twist is that we are often called with one of the - * matching queue_locks held as indicated by @locked_q, which - * prevents performing double-lock dance for other queues. - * - * So, we do it in two stages. The fast path uses the queue_lock - * the caller is holding and, if other queues need to be accessed, - * uses trylock to avoid introducing locking dependency. This can - * handle most cases, especially if @ioc was performing IO on only - * single device. - * - * If trylock doesn't cut it, we defer to @ioc->release_work which - * can do all the double-locking dancing. + * Releasing ioc requires reverse order double locking and we may + * already be holding a queue_lock. Do it asynchronously from wq. */ - spin_lock_irqsave_nested(&ioc->lock, flags, - ioc_release_depth(locked_q)); - - while (!hlist_empty(&ioc->icq_list)) { - struct io_cq *icq = hlist_entry(ioc->icq_list.first, - struct io_cq, ioc_node); - struct request_queue *this_q = icq->q; - - if (this_q != last_q) { - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - last_q = NULL; - - /* spin_trylock() always successes in UP case */ - if (this_q != locked_q && - !spin_trylock(this_q->queue_lock)) - break; - last_q = this_q; - continue; - } - ioc_exit_icq(icq); + if (atomic_long_dec_and_test(&ioc->refcount)) { + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) + schedule_work(&ioc->release_work); + spin_unlock_irqrestore(&ioc->lock, flags); } - - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - - spin_unlock_irqrestore(&ioc->lock, flags); - - /* if no icq is left, we're done; otherwise, kick release_work */ - if (hlist_empty(&ioc->icq_list)) - kmem_cache_free(iocontext_cachep, ioc); - else - schedule_work(&ioc->release_work); } EXPORT_SYMBOL(put_io_context); @@ -238,7 +168,7 @@ void exit_io_context(struct task_struct *task) task_unlock(task); atomic_dec(&ioc->nr_tasks); - 
put_io_context(ioc, NULL); + put_io_context(ioc); } /** diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index da21c24dbed3..5684df6848bc 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1794,7 +1794,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_queue = NULL; if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->icq.ioc, cfqd->queue); + put_io_context(cfqd->active_cic->icq.ioc); cfqd->active_cic = NULL; } } -- cgit v1.2.3 From 050c8ea80e3e90019d9e981c6a117ef614e882ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2012 09:19:38 +0100 Subject: block: separate out blk_rq_merge_ok() and blk_try_merge() from elevator functions blk_rq_merge_ok() is the elevator-neutral part of merge eligibility test. blk_try_merge() determines merge direction and expects the caller to have tested elv_rq_merge_ok() previously. elv_rq_merge_ok() now wraps blk_rq_merge_ok() and then calls elv_iosched_allow_merge(). elv_try_merge() is removed and the two callers are updated to call elv_rq_merge_ok() explicitly followed by blk_try_merge(). While at it, make rq_merge_ok() functions return bool. This is to prepare for plug merge update and doesn't introduce any behavior change. This is based on Jens' patch to skip elevator_allow_merge_fn() from plug merge. 
Signed-off-by: Tejun Heo LKML-Reference: <4F16F3CA.90904@kernel.dk> Original-patch-by: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- block/blk-merge.c | 37 +++++++++++++++++++++++++++++++++++++ block/blk.h | 2 ++ block/elevator.c | 55 ++++--------------------------------------------------- 4 files changed, 45 insertions(+), 53 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 532b3a21b383..fa697bf691eb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1282,10 +1282,10 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, (*request_count)++; - if (rq->q != q) + if (rq->q != q || !elv_rq_merge_ok(rq, bio)) continue; - el_ret = elv_try_merge(rq, bio); + el_ret = blk_try_merge(rq, bio); if (el_ret == ELEVATOR_BACK_MERGE) { ret = bio_attempt_back_merge(q, rq, bio); if (ret) diff --git a/block/blk-merge.c b/block/blk-merge.c index cfcc37cb222b..160035f54882 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -471,3 +471,40 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, { return attempt_merge(q, rq, next); } + +bool blk_rq_merge_ok(struct request *rq, struct bio *bio) +{ + if (!rq_mergeable(rq)) + return false; + + /* don't merge file system requests and discard requests */ + if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD)) + return false; + + /* don't merge discard requests and secure discard requests */ + if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE)) + return false; + + /* different data direction or already started, don't merge */ + if (bio_data_dir(bio) != rq_data_dir(rq)) + return false; + + /* must be same device and not a special request */ + if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) + return false; + + /* only merge integrity protected bio into ditto rq */ + if (bio_integrity(bio) != blk_integrity_rq(rq)) + return false; + + return true; +} + +int blk_try_merge(struct request *rq, struct bio 
*bio) +{ + if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector) + return ELEVATOR_BACK_MERGE; + else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector) + return ELEVATOR_FRONT_MERGE; + return ELEVATOR_NO_MERGE; +} diff --git a/block/blk.h b/block/blk.h index 7efd772336de..9c12f80882b0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -137,6 +137,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); void blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); +bool blk_rq_merge_ok(struct request *rq, struct bio *bio); +int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index 91e18f8af9be..f016855a46b0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -70,39 +70,9 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) /* * can we safely merge with this request? 
*/ -int elv_rq_merge_ok(struct request *rq, struct bio *bio) +bool elv_rq_merge_ok(struct request *rq, struct bio *bio) { - if (!rq_mergeable(rq)) - return 0; - - /* - * Don't merge file system requests and discard requests - */ - if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD)) - return 0; - - /* - * Don't merge discard requests and secure discard requests - */ - if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE)) - return 0; - - /* - * different data direction or already started, don't merge - */ - if (bio_data_dir(bio) != rq_data_dir(rq)) - return 0; - - /* - * must be same device and not a special request - */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) - return 0; - - /* - * only merge integrity protected bio into ditto rq - */ - if (bio_integrity(bio) != blk_integrity_rq(rq)) + if (!blk_rq_merge_ok(rq, bio)) return 0; if (!elv_iosched_allow_merge(rq, bio)) @@ -112,23 +82,6 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_rq_merge_ok); -int elv_try_merge(struct request *__rq, struct bio *bio) -{ - int ret = ELEVATOR_NO_MERGE; - - /* - * we can merge and sequence is ok, check if it's possible - */ - if (elv_rq_merge_ok(__rq, bio)) { - if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector) - ret = ELEVATOR_BACK_MERGE; - else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector) - ret = ELEVATOR_FRONT_MERGE; - } - - return ret; -} - static struct elevator_type *elevator_find(const char *name) { struct elevator_type *e; @@ -478,8 +431,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) /* * First try one-hit cache. 
*/ - if (q->last_merge) { - ret = elv_try_merge(q->last_merge, bio); + if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) { + ret = blk_try_merge(q->last_merge, bio); if (ret != ELEVATOR_NO_MERGE) { *req = q->last_merge; return ret; -- cgit v1.2.3 From 07c2bd37350c9b1af71b35d05f16e300a6602948 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2012 09:19:42 +0100 Subject: block: don't call elevator callbacks for plug merges Plug merge calls two elevator callbacks outside queue lock - elevator_allow_merge_fn() and elevator_bio_merged_fn(). Although attempt_plug_merge() suggests that elevator is guaranteed to be there through the existing request on the plug list, nothing prevents plug merge from calling into dying or initializing elevator. For regular merges, bypass ensures elvpriv count to reach zero, which in turn prevents merges as all !ELVPRIV requests get REQ_SOFTBARRIER from forced back insertion. Plug merge doesn't check ELVPRIV, and, as the requests haven't gone through elevator insertion yet, it doesn't have SOFTBARRIER set allowing merges on a bypassed queue. This, for example, leads to the following crash during elevator switch. 
BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] cfq_allow_merge+0x49/0xa0 PGD 112cbc067 PUD 115d5c067 PMD 0 Oops: 0000 [#1] PREEMPT SMP CPU 1 Modules linked in: deadline_iosched Pid: 819, comm: dd Not tainted 3.3.0-rc2-work+ #76 Bochs Bochs RIP: 0010:[] [] cfq_allow_merge+0x49/0xa0 RSP: 0018:ffff8801143a38f8 EFLAGS: 00010297 RAX: 0000000000000000 RBX: ffff88011817ce28 RCX: ffff880116eb6cc0 RDX: 0000000000000000 RSI: ffff880118056e20 RDI: ffff8801199512f8 RBP: ffff8801143a3908 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffff880118195708 R13: ffff880118052aa0 R14: ffff8801143a3d50 R15: ffff880118195708 FS: 00007f19f82cb700(0000) GS:ffff88011fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 0000000112c6a000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process dd (pid: 819, threadinfo ffff8801143a2000, task ffff880116eb6cc0) Stack: ffff88011817ce28 ffff880118195708 ffff8801143a3928 ffffffff81391bba ffff88011817ce28 ffff880118195708 ffff8801143a3948 ffffffff81391bf1 ffff88011817ce28 0000000000000000 ffff8801143a39a8 ffffffff81398e3e Call Trace: [] elv_rq_merge_ok+0x4a/0x60 [] elv_try_merge+0x21/0x40 [] blk_queue_bio+0x8e/0x390 [] generic_make_request+0xca/0x100 [] submit_bio+0x74/0x100 [] __blockdev_direct_IO+0x1ce2/0x3450 [] blkdev_direct_IO+0x57/0x60 [] generic_file_aio_read+0x6d5/0x760 [] do_sync_read+0xe2/0x120 [] vfs_read+0xc5/0x180 [] sys_read+0x51/0x90 [] system_call_fastpath+0x16/0x1b There are multiple ways to fix this including making plug merge check ELVPRIV; however, * Calling into elevator outside queue lock is confusing and error-prone. * Requests on plug list aren't known to the elevator. They aren't on the elevator yet, so there's no elevator specific state to update. 
* Given the nature of plug merges - collecting bio's for the same purpose from the same issuer - elevator specific restrictions aren't applicable. So, simply don't call into elevator methods from plug merge by moving elv_bio_merged() from bio_attempt_*_merge() to blk_queue_bio(), and using blk_try_merge() in attempt_plug_merge(). This is based on Jens' patch to skip elevator_allow_merge_fn() from plug merge. Note that this makes per-cgroup merged stats skip plug merging. Signed-off-by: Tejun Heo LKML-Reference: <4F16F3CA.90904@kernel.dk> Original-patch-by: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-core.c | 19 +++++++++---------- block/cfq-iosched.c | 15 ++++----------- 2 files changed, 13 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index fa697bf691eb..3a78b00edd71 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1212,7 +1212,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); return true; } @@ -1243,7 +1242,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); return true; } @@ -1257,13 +1255,12 @@ static bool bio_attempt_front_merge(struct request_queue *q, * on %current's plugged list. Returns %true if merge was successful, * otherwise %false. * - * This function is called without @q->queue_lock; however, elevator is - * accessed iff there already are requests on the plugged list which in - * turn guarantees validity of the elevator. - * - * Note that, on successful merge, elevator operation - * elevator_bio_merged_fn() will be called without queue lock. Elevator - * must be ready for this. + * Plugging coalesces IOs from the same issuer for the same purpose without + * going through @q->queue_lock. 
As such it's more of an issuing mechanism + * than scheduling, and the request, while may have elvpriv data, is not + * added on the elevator at this point. In addition, we don't have + * reliable access to the elevator outside queue lock. Only check basic + * merging parameters without querying the elevator. */ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count) @@ -1282,7 +1279,7 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, (*request_count)++; - if (rq->q != q || !elv_rq_merge_ok(rq, bio)) + if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; el_ret = blk_try_merge(rq, bio); @@ -1347,12 +1344,14 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) el_ret = elv_merge(q, &req, bio); if (el_ret == ELEVATOR_BACK_MERGE) { if (bio_attempt_back_merge(q, req, bio)) { + elv_bio_merged(q, req, bio); if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out_unlock; } } else if (el_ret == ELEVATOR_FRONT_MERGE) { if (bio_attempt_front_merge(q, req, bio)) { + elv_bio_merged(q, req, bio); if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); goto out_unlock; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5684df6848bc..d0ba50533668 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1699,18 +1699,11 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, /* * Lookup the cfqq that this bio will be queued with and allow - * merge only if rq is queued there. This function can be called - * from plug merge without queue_lock. In such cases, ioc of @rq - * and %current are guaranteed to be equal. Avoid lookup which - * requires queue_lock by using @rq's cic. + * merge only if rq is queued there. 
*/ - if (current->io_context == RQ_CIC(rq)->icq.ioc) { - cic = RQ_CIC(rq); - } else { - cic = cfq_cic_lookup(cfqd, current->io_context); - if (!cic) - return false; - } + cic = cfq_cic_lookup(cfqd, current->io_context); + if (!cic) + return false; cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); return cfqq == RQ_CFQQ(rq); -- cgit v1.2.3 From 37b40adf2d1b4a5e51323be73ccf8ddcf3f15dd3 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 8 Feb 2012 20:02:03 +0100 Subject: bsg: fix sysfs link remove warning We create "bsg" link if q->kobj.sd is not NULL, so remove it only when the same condition is true. Fixes: WARNING: at fs/sysfs/inode.c:323 sysfs_hash_and_remove+0x2b/0x77() sysfs: can not remove 'bsg', no directory Call Trace: [] warn_slowpath_common+0x6a/0x7f [] ? sysfs_hash_and_remove+0x2b/0x77 [] warn_slowpath_fmt+0x2b/0x2f [] sysfs_hash_and_remove+0x2b/0x77 [] sysfs_remove_link+0x20/0x23 [] bsg_unregister_queue+0x40/0x6d [] __scsi_remove_device+0x31/0x9d [] scsi_forget_host+0x41/0x52 [] scsi_remove_host+0x71/0xe0 [] quiesce_and_remove_host+0x51/0x83 [usb_storage] [] usb_stor_disconnect+0x18/0x22 [usb_storage] [] usb_unbind_interface+0x4e/0x109 [] __device_release_driver+0x6b/0xa6 [] device_release_driver+0x17/0x22 [] bus_remove_device+0xd6/0xe6 [] device_del+0xf2/0x137 [] usb_disable_device+0x94/0x1a0 Signed-off-by: Stanislaw Gruszka Signed-off-by: Jens Axboe --- block/bsg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index 4cf703fd98bb..ff64ae3bacee 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -983,7 +983,8 @@ void bsg_unregister_queue(struct request_queue *q) mutex_lock(&bsg_mutex); idr_remove(&bsg_minor_idr, bcd->minor); - sysfs_remove_link(&q->kobj, "bsg"); + if (q->kobj.sd) + sysfs_remove_link(&q->kobj, "bsg"); device_unregister(bcd->class_dev); bcd->class_dev = NULL; kref_put(&bcd->ref, bsg_kref_release_function); -- cgit v1.2.3 From d8c66c5d59247e25a69428aced0b79d33b9c66d6 Mon Sep 
17 00:00:00 2001 From: Tejun Heo Date: Sat, 11 Feb 2012 12:37:25 +0100 Subject: block: fix lockdep warning on io_context release put_io_context() 11a3122f6c "block: strip out locking optimization in put_io_context()" removed ioc_lock depth lockdep annotation along with locking optimization; however, while recursing from put_io_context() is no longer possible, ioc_release_fn() may still end up putting the last reference of another ioc through elevator, which will grab ioc->lock triggering spurious (as the ioc is always a different one) A-A deadlock warning. As this can only happen one time from ioc_release_fn(), using non-zero subclass from ioc_release_fn() is enough. Use subclass 1. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-ioc.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9884fd7427fe..8b782a63c297 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -80,8 +80,15 @@ static void ioc_release_fn(struct work_struct *work) struct io_context *ioc = container_of(work, struct io_context, release_work); struct request_queue *last_q = NULL; + unsigned long flags; - spin_lock_irq(&ioc->lock); + /* + * Exiting icq may call into put_io_context() through elevator + * which will trigger lockdep warning. The ioc's are guaranteed to + * be different, use a different locking subclass here. Use + * irqsave variant as there's no spin_lock_irq_nested(). 
+ */ + spin_lock_irqsave_nested(&ioc->lock, flags, 1); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, @@ -103,15 +110,15 @@ static void ioc_release_fn(struct work_struct *work) */ if (last_q) { spin_unlock(last_q->queue_lock); - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); blk_put_queue(last_q); } else { - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); } last_q = this_q; - spin_lock_irq(this_q->queue_lock); - spin_lock(&ioc->lock); + spin_lock_irqsave(this_q->queue_lock, flags); + spin_lock_nested(&ioc->lock, 1); continue; } ioc_exit_icq(icq); @@ -119,10 +126,10 @@ static void ioc_release_fn(struct work_struct *work) if (last_q) { spin_unlock(last_q->queue_lock); - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); blk_put_queue(last_q); } else { - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); } kmem_cache_free(iocontext_cachep, ioc); -- cgit v1.2.3 From 97387e3baaf3c35ad560f8878e943c720a77da1b Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Fri, 24 Feb 2012 09:37:42 +0000 Subject: LDM: Fix reassembly of extended VBLKs. From: Ben Hutchings Extended VBLKs (those larger than the preset VBLK size) are divided into fragments, each with its own VBLK header. Our LDM implementation generally assumes that each VBLK is contiguous in memory, so these fragments must be assembled before further processing. Currently the reassembly seems to be done quite wrongly - no VBLK header is copied into the contiguous buffer, and the length of the header is subtracted twice from each fragment. Also the total length of the reassembled VBLK is calculated incorrectly. 
Signed-off-by: Ben Hutchings Signed-off-by: Anton Altaparmakov --- block/partitions/ldm.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index bd8ae788f689..e507cfbd044e 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -2,7 +2,7 @@ * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) * * Copyright (C) 2001,2002 Richard Russon - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2012 Anton Altaparmakov * Copyright (C) 2001,2002 Jakob Kemi * * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads @@ -1341,20 +1341,17 @@ found: ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); return false; } - if (f->map & (1 << rec)) { ldm_error ("Duplicate VBLK, part %d.", rec); f->map &= 0x7F; /* Mark the group as broken */ return false; } - f->map |= (1 << rec); - + if (!rec) + memcpy(f->data, data, VBLK_SIZE_HEAD); data += VBLK_SIZE_HEAD; size -= VBLK_SIZE_HEAD; - - memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size); - + memcpy(f->data + VBLK_SIZE_HEAD + rec * size, data, size); return true; } -- cgit v1.2.3