diff options
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- | drivers/md/raid10.c | 97 |
1 files changed, 75 insertions, 22 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 64d48249c03b..77b562d18a90 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -38,21 +38,36 @@ * near_copies (stored in low byte of layout) * far_copies (stored in second byte of layout) * far_offset (stored in bit 16 of layout ) + * use_far_sets (stored in bit 17 of layout ) * - * The data to be stored is divided into chunks using chunksize. - * Each device is divided into far_copies sections. - * In each section, chunks are laid out in a style similar to raid0, but - * near_copies copies of each chunk is stored (each on a different drive). - * The starting device for each section is offset near_copies from the starting - * device of the previous section. - * Thus they are (near_copies*far_copies) of each chunk, and each is on a different - * drive. - * near_copies and far_copies must be at least one, and their product is at most - * raid_disks. + * The data to be stored is divided into chunks using chunksize. Each device + * is divided into far_copies sections. In each section, chunks are laid out + * in a style similar to raid0, but near_copies copies of each chunk is stored + * (each on a different drive). The starting device for each section is offset + * near_copies from the starting device of the previous section. Thus there + * are (near_copies * far_copies) of each chunk, and each is on a different + * drive. near_copies and far_copies must be at least one, and their product + * is at most raid_disks. * * If far_offset is true, then the far_copies are handled a bit differently. - * The copies are still in different stripes, but instead of be very far apart - * on disk, there are adjacent stripes. + * The copies are still in different stripes, but instead of being very far + * apart on disk, there are adjacent stripes. + * + * The far and offset algorithms are handled slightly differently if + * 'use_far_sets' is true. In this case, the array's devices are grouped into + * sets that are (near_copies * far_copies) in size. The far copied stripes + * are still shifted by 'near_copies' devices, but this shifting stays confined + * to the set rather than the entire array. This is done to improve the number + * of device combinations that can fail without causing the array to fail. + * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk + * on a device): + * A B C D A B C D E + * ... ... + * D A B C E A B C D + * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): + * [A B] [C D] [A B] [C D E] + * |...| |...| |...| | ... | + * [B A] [D C] [B A] [E C D] */ /* @@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) sector_t stripe; int dev; int slot = 0; + int last_far_set_start, last_far_set_size; + + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + last_far_set_size = geo->far_set_size; + last_far_set_size += (geo->raid_disks % geo->far_set_size); /* now calculate first sector/dev */ chunk = r10bio->sector >> geo->chunk_shift; @@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) /* and calculate all the others */ for (n = 0; n < geo->near_copies; n++) { int d = dev; + int set; sector_t s = sector; - r10bio->devs[slot].addr = sector; r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; slot++; for (f = 1; f < geo->far_copies; f++) { + set = d / geo->far_set_size; d += geo->near_copies; - if (d >= geo->raid_disks) - d -= geo->raid_disks; + + if ((geo->raid_disks % geo->far_set_size) && + (d > last_far_set_start)) { + d -= last_far_set_start; + d %= last_far_set_size; + d += last_far_set_start; + } else { + d %= geo->far_set_size; + d += geo->far_set_size * set; + } s += geo->stride; r10bio->devs[slot].devnum = d; r10bio->devs[slot].addr = s; @@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * or recovery, so reshape isn't happening */ struct geom *geo = &conf->geo; + int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; + int far_set_size = geo->far_set_size; + int last_far_set_start; + + if (geo->raid_disks % geo->far_set_size) { + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + if (dev >= last_far_set_start) { + far_set_size = geo->far_set_size; + far_set_size += (geo->raid_disks % geo->far_set_size); + far_set_start = last_far_set_start; + } + } offset = sector & geo->chunk_mask; if (geo->far_offset) { @@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) chunk = sector >> geo->chunk_shift; fc = sector_div(chunk, geo->far_copies); dev -= fc * geo->near_copies; - if (dev < 0) - dev += geo->raid_disks; + if (dev < far_set_start) + dev += far_set_size; } else { while (sector >= geo->stride) { sector -= geo->stride; - if (dev < geo->near_copies) - dev += geo->raid_disks - geo->near_copies; + if (dev < (geo->near_copies + far_set_start)) + dev += far_set_size - geo->near_copies; else dev -= geo->near_copies; } @@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_barrier); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -1460,7 +1508,8 @@ retry_write: rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1502,7 +1551,8 @@ retry_write: r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) disks = mddev->raid_disks + mddev->delta_disks; break; } - if (layout >> 17) + if (layout >> 18) return -1; if (chunk < (PAGE_SIZE >> 9) || !is_power_of_2(chunk)) @@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) geo->near_copies = nc; geo->far_copies = fc; geo->far_offset = fo; + geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks; geo->chunk_mask = chunk - 1; geo->chunk_shift = ffz(~chunk); return nc*fc; @@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, + mddev->chunk_sectors); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |