From e9c7469bb4f502dafc092166201bea1ad5fc0fbf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:18 +0200 Subject: md: implment REQ_FLUSH/FUA support This patch converts md to support REQ_FLUSH/FUA instead of now deprecated REQ_HARDBARRIER. In the core part (md.c), the following changes are notable. * Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA don't interfere with processing of other requests and thus there is no reason to mark the queue congested while FLUSH/FUA is in progress. * REQ_FLUSH/FUA failures are final and its users don't need retry logic. Retry logic is removed. * Preflush needs to be issued to all member devices but FUA writes can be handled the same way as other writes - their processing can be deferred to request_queue of member devices. md_barrier_request() is renamed to md_flush_request() and simplified accordingly. For linear, raid0 and multipath, the core changes are enough. raid1, 5 and 10 need the following conversions. * raid1: Handling of FLUSH/FUA bio's can simply be deferred to request_queues of member devices. Barrier related logic removed. * raid5: Queue draining logic dropped. FUA bit is propagated through biodrain and stripe resconstruction such that all the updated parts of the stripe are written out with FUA writes if any of the dirtying writes was FUA. preread_active_stripes handling in make_request() is updated as suggested by Neil Brown. * raid10: FUA bit needs to be propagated to write clones. linear, raid0, 1, 5 and 10 tested. Signed-off-by: Tejun Heo Reviewed-by: Neil Brown Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 176 ++++++++++++++++++----------------------------------- 1 file changed, 59 insertions(+), 117 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ad83a4dcadc3..886a9d865488 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error) if (r1_bio->bios[mirror] == bio) break; - if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { - set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); - set_bit(R1BIO_BarrierRetry, &r1_bio->state); - r1_bio->mddev->barriers_work = 0; - /* Don't rdev_dec_pending in this branch - keep it for the retry */ - } else { + /* + * 'one mirror IO has finished' event handler: + */ + r1_bio->bios[mirror] = NULL; + to_put = bio; + if (!uptodate) { + md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } else /* - * this branch is our 'one mirror IO has finished' event handler: + * Set R1BIO_Uptodate in our master bio, so that we + * will return a good error code for to the higher + * levels even if IO on some other mirrored buffer + * fails. + * + * The 'master' represents the composite IO operation + * to user-side. So if something waits for IO, then it + * will wait for the 'master' bio. */ - r1_bio->bios[mirror] = NULL; - to_put = bio; - if (!uptodate) { - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } else - /* - * Set R1BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. - */ - set_bit(R1BIO_Uptodate, &r1_bio->state); - - update_head_pos(mirror, r1_bio); - - if (behind) { - if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) - atomic_dec(&r1_bio->behind_remaining); - - /* In behind mode, we ACK the master bio once the I/O has safely - * reached all non-writemostly disks. Setting the Returned bit - * ensures that this gets done only once -- we don't ever want to - * return -EIO here, instead we'll wait */ - - if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && - test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* Maybe we can return now */ - if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { - struct bio *mbio = r1_bio->master_bio; - PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", - (unsigned long long) mbio->bi_sector, - (unsigned long long) mbio->bi_sector + - (mbio->bi_size >> 9) - 1); - bio_endio(mbio, 0); - } + set_bit(R1BIO_Uptodate, &r1_bio->state); + + update_head_pos(mirror, r1_bio); + + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* + * In behind mode, we ACK the master bio once the I/O + * has safely reached all non-writemostly + * disks. Setting the Returned bit ensures that this + * gets done only once -- we don't ever want to return + * -EIO here, instead we'll wait + */ + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, 0); } } - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + /* - * * Let's see if all mirrored write operations have finished * already. */ if (atomic_dec_and_test(&r1_bio->remaining)) { - if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - /* it really is the end of this request */ - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = bio->bi_vcnt; - while (i--) - safe_put_page(bio->bi_io_vec[i].bv_page); - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + safe_put_page(bio->bi_io_vec[i].bv_page); } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); + md_write_end(r1_bio->mddev); + raid_end_bio_io(r1_bio); } if (to_put) @@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio) struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); - unsigned long do_barriers; + const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); mdk_rdev_t *blocked_rdev; /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. - * We test barriers_work *after* md_write_start as md_write_start - * may cause the first superblock write, and that will check out - * if barriers work. */ md_write_start(mddev, bio); /* wait on superblock update early */ @@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) } finish_wait(&conf->wait_barrier, &w); } - if (unlikely(!mddev->barriers_work && - (bio->bi_rw & REQ_HARDBARRIER))) { - if (rw == WRITE) - md_write_end(mddev); - bio_endio(bio, -EOPNOTSUPP); - return 0; - } wait_barrier(conf); @@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) atomic_set(&r1_bio->remaining, 0); atomic_set(&r1_bio->behind_remaining, 0); - do_barriers = bio->bi_rw & REQ_HARDBARRIER; - if (do_barriers) - set_bit(R1BIO_Barrier, &r1_bio->state); - bio_list_init(&bl); for (i = 0; i < disks; i++) { struct bio *mbio; @@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_barriers | do_sync; + mbio->bi_rw = WRITE | do_flush_fua | do_sync; mbio->bi_private = r1_bio; if (behind_pages) { @@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev) if (test_bit(R1BIO_IsSync, &r1_bio->state)) { sync_request_write(mddev, r1_bio); unplug = 1; - } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { - /* some requests in the r1bio were REQ_HARDBARRIER - * requests which failed with -EOPNOTSUPP. Hohumm.. - * Better resubmit without the barrier. - * We know which devices to resubmit for, because - * all others have had their bios[] entry cleared. - * We already have a nr_pending reference on these rdevs. - */ - int i; - const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC); - clear_bit(R1BIO_BarrierRetry, &r1_bio->state); - clear_bit(R1BIO_Barrier, &r1_bio->state); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) - atomic_inc(&r1_bio->remaining); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) { - struct bio_vec *bvec; - int j; - - bio = bio_clone(r1_bio->master_bio, GFP_NOIO); - /* copy pages from the failed bio, as - * this might be a write-behind device */ - __bio_for_each_segment(bvec, bio, j, 0) - bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; - bio_put(r1_bio->bios[i]); - bio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - bio->bi_bdev = conf->mirrors[i].rdev->bdev; - bio->bi_end_io = raid1_end_write_request; - bio->bi_rw = WRITE | do_sync; - bio->bi_private = r1_bio; - r1_bio->bios[i] = bio; - generic_make_request(bio); - } } else { int disk; -- cgit v1.2.3 From 7571ae887d3b96d8e7ce63d43828866c6a91cdc6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 Oct 2010 11:54:46 +1100 Subject: md/raid1: avoid overflow in raid1 resync when bitmap is in use. bitmap_start_sync returns - via a pass-by-reference variable - the number of sectors before we need to check with the bitmap again. Since commit ef4256733506f245 this number can be substantially larger, 2^27 is a common value. Unfortunately it is an 'int' and so when raid1.c:sync_request shifts it 9 places to the left it becomes 0. This results in a zero-length read which the scsi layer justifiably complains about. This patch just removes the shift so the common case becomes safe with a trivially-correct patch. In the next merge window we will convert this 'int' to a 'sector_t' Reported-by: "George Spelvin" Signed-off-by: NeilBrown --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ad83a4dcadc3..fba4d2feaeb4 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1912,7 +1912,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; BUG_ON(sync_blocks < (PAGE_SIZE>>9)); - if (len > (sync_blocks<<9)) + if ((len >> 9) > sync_blocks) len = sync_blocks<<9; } -- cgit v1.2.3 From db8d9d3591f77cc6b66248b2cdfa1c43deee5ec9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 Oct 2010 12:00:50 +1100 Subject: md/raid1: minor bio initialisation improvements. When performing a resync we pre-allocate some bios and repeatedly use them. This requires us to re-initialise them each time. One field (bi_comp_cpu) and some flags weren't being initiaised reliably. Signed-off-by: NeilBrown --- drivers/md/raid1.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fba4d2feaeb4..0b830bbe1d8b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1839,7 +1839,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* take from bio_init */ bio->bi_next = NULL; + bio->bi_flags &= ~(BIO_POOL_MASK-1); bio->bi_flags |= 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; bio->bi_rw = READ; bio->bi_vcnt = 0; bio->bi_idx = 0; -- cgit v1.2.3