From 6d46964230d182c4b6097379738849a809d791dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Nov 2018 17:02:18 +0100 Subject: block: remove the lock argument to blk_alloc_queue_node With the legacy request path gone there is no real need to override the queue_lock. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/dm.c') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c510179a7f84..a733e4c920af 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1896,7 +1896,7 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; md->queue->queuedata = md; -- cgit v1.2.3 From 892ad71f622bbf39c6de321d5ca9b0fdec237c24 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:30 -0500 Subject: dm: set the static flush bio device on demand The next patch changes the macro bio_set_dev() to associate a bio with a blkg based on the device set. However, dm creates a static bio to be used as the basis for cloning empty flush bios on creation. The bio_set_dev() call in alloc_dev() will cause problems with the next patch adding association to bio_set_dev() because the call is before the bdev is associated with a gendisk (bd_disk is %NULL). To get around this, set the device on the static bio every time and use that to clone to the other bios. 
Signed-off-by: Dennis Zhou Acked-by: Mike Snitzer Cc: Alasdair Kergon Signed-off-by: Jens Axboe --- drivers/md/dm.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers/md/dm.c') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a733e4c920af..ab72d79775ee 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1417,10 +1417,21 @@ static int __send_empty_flush(struct clone_info *ci) unsigned target_nr = 0; struct dm_target *ti; + /* + * Empty flush uses a statically initialized bio, &md->flush_bio, as + * the base for cloning. However, blkg association requires that a + * bdev is associated with a gendisk, which doesn't happen until the + * bdev is opened. So, blkg association is done at issue time of the + * flush rather than when the device is created in alloc_dev(). + */ + bio_set_dev(ci->bio, ci->io->md->bdev); + BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + bio_disassociate_blkg(ci->bio); + return 0; } @@ -1939,7 +1950,6 @@ static struct mapped_device *alloc_dev(int minor) goto bad; bio_init(&md->flush_bio, NULL, 0); - bio_set_dev(&md->flush_bio, md->bdev); md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; dm_stats_init(&md->stats); -- cgit v1.2.3 From 80a787ba3809deb694ee632919badb73890daf05 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 6 Dec 2018 11:41:16 -0500 Subject: dm: dont rewrite dm_disk(md)->part0.in_flight generic_start_io_acct and generic_end_io_acct already update the variable in_flight using atomic operations, so we don't have to overwrite them again. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/md/dm.c') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ab72d79775ee..33100e536c4e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -663,8 +663,7 @@ static void start_io_acct(struct dm_io *io) generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), &dm_disk(md)->part0); - atomic_set(&dm_disk(md)->part0.in_flight[rw], - atomic_inc_return(&md->pending[rw])); + atomic_inc(&md->pending[rw]); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), @@ -693,7 +692,6 @@ static void end_io_acct(struct dm_io *io) * a flush. */ pending = atomic_dec_return(&md->pending[rw]); - atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); pending += atomic_read(&md->pending[rw^0x1]); /* nudge anyone waiting on suspend queue */ -- cgit v1.2.3 From 6f75723190d88e1319bea623bfe0292bf3917965 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 6 Dec 2018 11:41:22 -0500 Subject: dm: remove the pending IO accounting Remove the "pending" atomic counters, that duplicate block-core's in_flight counters, and update md_in_flight() to look at percpu in_flight counters. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'drivers/md/dm.c') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 33100e536c4e..70568f8b6c53 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -646,25 +646,30 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -int md_in_flight(struct mapped_device *md) +static bool md_in_flight(struct mapped_device *md) { - return atomic_read(&md->pending[READ]) + - atomic_read(&md->pending[WRITE]); + int cpu; + struct hd_struct *part = &dm_disk(md)->part0; + + for_each_possible_cpu(cpu) { + if (part_stat_local_read_cpu(part, in_flight[0], cpu) || + part_stat_local_read_cpu(part, in_flight[1], cpu)) + return true; + } + + return false; } static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; - int rw = bio_data_dir(bio); io->start_time = jiffies; generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), &dm_disk(md)->part0); - atomic_inc(&md->pending[rw]); - if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), @@ -676,8 +681,6 @@ static void end_io_acct(struct dm_io *io) struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; unsigned long duration = jiffies - io->start_time; - int pending; - int rw = bio_data_dir(bio); generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, io->start_time); @@ -687,16 +690,11 @@ static void end_io_acct(struct dm_io *io) bio->bi_iter.bi_sector, bio_sectors(bio), true, duration, &io->stats_aux); - /* - * After this is decremented the bio must not be touched if it is - * a flush. 
- */ - pending = atomic_dec_return(&md->pending[rw]); - pending += atomic_read(&md->pending[rw^0x1]); - /* nudge anyone waiting on suspend queue */ - if (!pending) - wake_up(&md->wait); + if (unlikely(waitqueue_active(&md->wait))) { + if (!md_in_flight(md)) + wake_up(&md->wait); + } } /* @@ -1915,8 +1913,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->disk) goto bad; - atomic_set(&md->pending[0], 0); - atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); -- cgit v1.2.3 From b7934ba4147a883f7a1b32c6408179274a4d6ed1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Dec 2018 15:45:53 -0700 Subject: dm: fix inflight IO check After switching to percpu inflight counters, the inflight check is totally buggy. It's perfectly valid for some counters to be non-zero while having a total inflight IO count of 0, that's how these kinds of counters work (inc on one CPU, dec on another). Fix the md_in_flight() check to sum all counters before returning a false positive, potentially. While at it, remove the inflight read for IO completion. We don't need it, just wake anyone that's waiting for the IO count to drop to zero. The caller needs to re-check that value anyway when woken, which it does. 
Fixes: 6f75723190d8 ("dm: remove the pending IO accounting") Acked-by: Mike Snitzer Reported-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'drivers/md/dm.c') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 70568f8b6c53..79ad4b3d215c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -650,14 +650,14 @@ static bool md_in_flight(struct mapped_device *md) { int cpu; struct hd_struct *part = &dm_disk(md)->part0; + long sum = 0; for_each_possible_cpu(cpu) { - if (part_stat_local_read_cpu(part, in_flight[0], cpu) || - part_stat_local_read_cpu(part, in_flight[1], cpu)) - return true; + sum += part_stat_local_read_cpu(part, in_flight[0], cpu); + sum += part_stat_local_read_cpu(part, in_flight[1], cpu); } - return false; + return sum != 0; } static void start_io_acct(struct dm_io *io) @@ -691,10 +691,8 @@ static void end_io_acct(struct dm_io *io) true, duration, &io->stats_aux); /* nudge anyone waiting on suspend queue */ - if (unlikely(waitqueue_active(&md->wait))) { - if (!md_in_flight(md)) - wake_up(&md->wait); - } + if (unlikely(waitqueue_active(&md->wait))) + wake_up(&md->wait); } /* -- cgit v1.2.3 From c4576aed8d85d808cd6443bda58393d525207d01 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 11 Dec 2018 09:10:26 -0500 Subject: dm: fix request-based dm's use of dm_wait_for_completion The md->wait waitqueue is used by both bio-based and request-based DM. Commit dbd3bbd291 ("dm rq: leverage blk_mq_queue_busy() to check for outstanding IO") lost sight of the requirement that dm_wait_for_completion() must work with all types of DM devices. Fix md_in_flight() to call the blk-mq or bio-based method accordingly. 
Fixes: dbd3bbd291 ("dm rq: leverage blk_mq_queue_busy() to check for outstanding IO") Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/md/dm.c') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 79ad4b3d215c..c414d40d645d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -646,7 +646,7 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -static bool md_in_flight(struct mapped_device *md) +static bool md_in_flight_bios(struct mapped_device *md) { int cpu; struct hd_struct *part = &dm_disk(md)->part0; @@ -660,6 +660,14 @@ static bool md_in_flight(struct mapped_device *md) return sum != 0; } +static bool md_in_flight(struct mapped_device *md) +{ + if (queue_is_mq(md->queue)) + return blk_mq_queue_busy(md->queue); + else + return md_in_flight_bios(md); +} + static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; -- cgit v1.2.3 From 3c94d83cb352627f221d971b05f163c17527de74 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Dec 2018 21:11:17 -0700 Subject: blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight() There's a single user of this function, dm, and dm just wants to check if IO is inflight, not that it's just allocated. This fixes a hang with srp/002 in blktests with dm, where it tries to suspend but waits for inflight IO to finish first. As it checks for just allocated requests, this fails. 
Tested-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/dm.c') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c414d40d645d..dddbca63e140 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -663,7 +663,7 @@ static bool md_in_flight_bios(struct mapped_device *md) static bool md_in_flight(struct mapped_device *md) { if (queue_is_mq(md->queue)) - return blk_mq_queue_busy(md->queue); + return blk_mq_queue_inflight(md->queue); else return md_in_flight_bios(md); } -- cgit v1.2.3 From dbe3ece1287dafe4113c64ada3113c39f344c64a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Dec 2018 09:13:34 -0700 Subject: dm: don't reuse bio for flushes DM currently has a statically allocated bio that it uses to issue empty flushes. It doesn't submit this bio, it just uses it for maintaining state while setting up clones. Multiple users can access this bio at the same time. This wasn't previously an issue, even if it was a bit iffy, but with the blkg associations it can become one. We set up the blkg association, then clone bios and submit, then remove the blkg association again. But since we can have multiple tasks doing this at the same time, against multiple blkgs, then we can either lose references to a blkg, or put it twice. The latter causes complaints on the percpu ref being <= 0 when released, and can cause use-after-free as well. Ming reports that xfstest generic/475 triggers this: ------------[ cut here ]------------ percpu ref (blkg_release) <= 0 (0) after switching to atomic WARNING: CPU: 13 PID: 0 at lib/percpu-refcount.c:155 percpu_ref_switch_to_atomic_rcu+0x2c9/0x4a0 Switch to just using an on-stack bio for this, and get rid of the embedded bio. 
Fixes: 5cdf2e3fea5e ("blkcg: associate blkg when associating a device") Reported-by: Ming Lei Tested-by: Ming Lei Reviewed-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'drivers/md/dm.c') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dddbca63e140..f588a6a83d80 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1420,11 +1420,11 @@ static int __send_empty_flush(struct clone_info *ci) struct dm_target *ti; /* - * Empty flush uses a statically initialized bio, &md->flush_bio, as - * the base for cloning. However, blkg association requires that a - * bdev is associated with a gendisk, which doesn't happen until the - * bdev is opened. So, blkg association is done at issue time of the - * flush rather than when the device is created in alloc_dev(). + * Empty flush uses a statically initialized bio, as the base for + * cloning. However, blkg association requires that a bdev is + * associated with a gendisk, which doesn't happen until the bdev is + * opened. So, blkg association is done at issue time of the flush + * rather than when the device is created in alloc_dev(). */ bio_set_dev(ci->bio, ci->io->md->bdev); @@ -1609,7 +1609,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). 
+ */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1665,7 +1674,16 @@ static blk_qc_t __process_bio(struct mapped_device *md, init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). + */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1949,9 +1967,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->bdev) goto bad; - bio_init(&md->flush_bio, NULL, 0); - md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - dm_stats_init(&md->stats); /* Populate the mapping, nobody knows we exist yet */ -- cgit v1.2.3