summaryrefslogtreecommitdiff
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-10-13 13:22:01 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2012-10-13 13:22:01 -0700
commit9db908806b85c1430150fbafe269a7b21b07d15d (patch)
tree3911759c93e0be26b6771e1a92b75612b206ffa5 /drivers/md/raid5.c
parent4d7127dace8cf4b05eb7c8c8531fc204fbb195f4 (diff)
parent72f36d5972a166197036c1281963f6863c429bf2 (diff)
Merge tag 'md-3.7' of git://neil.brown.name/md
Pull md updates from NeilBrown: - "discard" support, some dm-raid improvements and other assorted bits and pieces. * tag 'md-3.7' of git://neil.brown.name/md: (29 commits) md: refine reporting of resync/reshape delays. md/raid5: be careful not to resize_stripes too big. md: make sure manual changes to recovery checkpoint are saved. md/raid10: use correct limit variable md: writing to sync_action should clear the read-auto state. Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races md/raid5: make sure to_read and to_write never go negative. md: When RAID5 is dirty, force reconstruct-write instead of read-modify-write. md/raid5: protect debug message against NULL derefernce. md/raid5: add some missing locking in handle_failed_stripe. MD: raid5 avoid unnecessary zero page for trim MD: raid5 trim support md/bitmap:Don't use IS_ERR to judge alloc_page(). md/raid1: Don't release reference to device while handling read error. raid: replace list_for_each_continue_rcu with new interface add further __init annotations to crypto/xor.c DM RAID: Fix for "sync" directive ineffectiveness DM RAID: Fix comparison of index and quantity for "rebuild" parameter DM RAID: Add rebuild capability for RAID10 DM RAID: Move 'rebuild' checking code to its own function ...
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c219
1 files changed, 197 insertions, 22 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0689173fd9f5..c5439dce0295 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -551,6 +551,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rw = WRITE_FUA;
else
rw = WRITE;
+ if (test_bit(R5_Discard, &sh->dev[i].flags))
+ rw |= REQ_DISCARD;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
else if (test_and_clear_bit(R5_WantReplace,
@@ -1174,8 +1176,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_rw & REQ_SYNC)
set_bit(R5_SyncIO, &dev->flags);
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
+ if (wbi->bi_rw & REQ_DISCARD)
+ set_bit(R5_Discard, &dev->flags);
+ else
+ tx = async_copy_data(1, wbi, dev->page,
+ dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector);
}
}
@@ -1191,7 +1196,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
int i;
- bool fua = false, sync = false;
+ bool fua = false, sync = false, discard = false;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@@ -1199,13 +1204,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
for (i = disks; i--; ) {
fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+ discard |= test_bit(R5_Discard, &sh->dev[i].flags);
}
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) {
- set_bit(R5_UPTODATE, &dev->flags);
+ if (!discard)
+ set_bit(R5_UPTODATE, &dev->flags);
if (fua)
set_bit(R5_WantFUA, &dev->flags);
if (sync)
@@ -1241,6 +1248,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (pd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
@@ -1285,10 +1304,24 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
{
struct async_submit_ctl submit;
struct page **blocks = percpu->scribble;
- int count;
+ int count, i;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+ for (i = 0; i < sh->disks; i++) {
+ if (sh->pd_idx == i || sh->qd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
+
count = set_syndrome_sources(blocks, sh);
atomic_inc(&sh->count);
@@ -2408,11 +2441,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
- spin_unlock_irq(&sh->stripe_lock);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
(unsigned long long)(*bip)->bi_sector,
(unsigned long long)sh->sector, dd_idx);
+ spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap && firstwrite) {
bitmap_startwrite(conf->mddev->bitmap, sh->sector,
@@ -2479,10 +2512,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
spin_unlock_irq(&sh->stripe_lock);
- if (bi) {
- s->to_write--;
+ if (bi)
bitmap_end = 1;
- }
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -2524,11 +2555,12 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
(!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags))) {
+ spin_lock_irq(&sh->stripe_lock);
bi = sh->dev[i].toread;
sh->dev[i].toread = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- if (bi) s->to_read--;
while (bi && bi->bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi =
@@ -2741,7 +2773,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
if (sh->dev[i].written) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
- test_bit(R5_UPTODATE, &dev->flags)) {
+ (test_bit(R5_UPTODATE, &dev->flags) ||
+ test_and_clear_bit(R5_Discard, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
pr_debug("Return write for disc %d\n", i);
@@ -2775,12 +2808,25 @@ static void handle_stripe_dirtying(struct r5conf *conf,
int disks)
{
int rmw = 0, rcw = 0, i;
- if (conf->max_degraded == 2) {
- /* RAID6 requires 'rcw' in current implementation
- * Calculate the real rcw later - for now fake it
+ sector_t recovery_cp = conf->mddev->recovery_cp;
+
+ /* RAID6 requires 'rcw' in current implementation.
+ * Otherwise, check whether resync is now happening or should start.
+ * If yes, then the array is dirty (after unclean shutdown or
+ * initial creation), so parity in some stripes might be inconsistent.
+ * In this case, we need to always do reconstruct-write, to ensure
+ * that in case of drive failure or read-error correction, we
+ * generate correct data from the parity.
+ */
+ if (conf->max_degraded == 2 ||
+ (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
+ /* Calculate the real rcw later - for now make it
* look like rcw is cheaper
*/
rcw = 1; rmw = 2;
+ pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
+ conf->max_degraded, (unsigned long long)recovery_cp,
+ (unsigned long long)sh->sector);
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
@@ -2932,7 +2978,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
- conf->mddev->resync_mismatches += STRIPE_SECTORS;
+ atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@@ -3084,7 +3130,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
*/
}
} else {
- conf->mddev->resync_mismatches += STRIPE_SECTORS;
+ atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@@ -3459,10 +3505,12 @@ static void handle_stripe(struct stripe_head *sh)
if (s.written &&
(s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
&& !test_bit(R5_LOCKED, &pdev->flags)
- && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+ && (test_bit(R5_UPTODATE, &pdev->flags) ||
+ test_bit(R5_Discard, &pdev->flags))))) &&
(s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
&& !test_bit(R5_LOCKED, &qdev->flags)
- && test_bit(R5_UPTODATE, &qdev->flags)))))
+ && (test_bit(R5_UPTODATE, &qdev->flags) ||
+ test_bit(R5_Discard, &qdev->flags))))))
handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
/* Now we might consider reading some blocks, either to check/generate
@@ -3489,9 +3537,11 @@ static void handle_stripe(struct stripe_head *sh)
/* All the 'written' buffers and the parity block are ready to
* be written back to disk
*/
- BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
+ !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
BUG_ON(sh->qd_idx >= 0 &&
- !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
+ !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
+ !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -4072,6 +4122,88 @@ static void release_stripe_plug(struct mddev *mddev,
release_stripe(sh);
}
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+ struct r5conf *conf = mddev->private;
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
+ int remaining;
+ int stripe_sectors;
+
+ if (mddev->reshape_position != MaxSector)
+ /* Skip discard while reshape is happening */
+ return;
+
+ logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+ stripe_sectors = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+ logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+ stripe_sectors);
+ sector_div(last_sector, stripe_sectors);
+
+ logical_sector *= conf->chunk_sectors;
+ last_sector *= conf->chunk_sectors;
+
+ for (; logical_sector < last_sector;
+ logical_sector += STRIPE_SECTORS) {
+ DEFINE_WAIT(w);
+ int d;
+ again:
+ sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+ prepare_to_wait(&conf->wait_for_overlap, &w,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock_irq(&sh->stripe_lock);
+ for (d = 0; d < conf->raid_disks; d++) {
+ if (d == sh->pd_idx || d == sh->qd_idx)
+ continue;
+ if (sh->dev[d].towrite || sh->dev[d].toread) {
+ set_bit(R5_Overlap, &sh->dev[d].flags);
+ spin_unlock_irq(&sh->stripe_lock);
+ release_stripe(sh);
+ schedule();
+ goto again;
+ }
+ }
+ finish_wait(&conf->wait_for_overlap, &w);
+ for (d = 0; d < conf->raid_disks; d++) {
+ if (d == sh->pd_idx || d == sh->qd_idx)
+ continue;
+ sh->dev[d].towrite = bi;
+ set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+ raid5_inc_bi_active_stripes(bi);
+ }
+ spin_unlock_irq(&sh->stripe_lock);
+ if (conf->mddev->bitmap) {
+ for (d = 0;
+ d < conf->raid_disks - conf->max_degraded;
+ d++)
+ bitmap_startwrite(mddev->bitmap,
+ sh->sector,
+ STRIPE_SECTORS,
+ 0);
+ sh->bm_seq = conf->seq_flush + 1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe_plug(mddev, sh);
+ }
+
+ remaining = raid5_dec_bi_active_stripes(bi);
+ if (remaining == 0) {
+ md_write_end(mddev);
+ bio_endio(bi, 0);
+ }
+}
+
static void make_request(struct mddev *mddev, struct bio * bi)
{
struct r5conf *conf = mddev->private;
@@ -4094,6 +4226,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
chunk_aligned_read(mddev,bi))
return;
+ if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+ make_discard_request(mddev, bi);
+ return;
+ }
+
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
bi->bi_next = NULL;
@@ -4630,8 +4767,9 @@ static int handle_active_stripes(struct r5conf *conf)
* During the scan, completed stripes are saved for us by the interrupt
* handler, so that they will not have to wait for our next wakeup.
*/
-static void raid5d(struct mddev *mddev)
+static void raid5d(struct md_thread *thread)
{
+ struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
int handled;
struct blk_plug plug;
@@ -5366,6 +5504,7 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
int chunk_size;
+ bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
@@ -5385,13 +5524,48 @@ static int run(struct mddev *mddev)
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
+ /*
+ * We can only discard a whole stripe. It doesn't make sense to
+ * discard data disk but write parity disk
+ */
+ stripe = stripe * PAGE_SIZE;
+ mddev->queue->limits.discard_alignment = stripe;
+ mddev->queue->limits.discard_granularity = stripe;
+ /*
+ * unaligned part of discard request will be ignored, so can't
+ * guarantee discard_zerors_data
+ */
+ mddev->queue->limits.discard_zeroes_data = 0;
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
+ /*
+ * discard_zeroes_data is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if
+ * discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; The stripe data of this
+ * disk is lost.
+ */
+ if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+ !bdev_get_queue(rdev->bdev)->
+ limits.discard_zeroes_data)
+ discard_supported = false;
}
+
+ if (discard_supported &&
+ mddev->queue->limits.max_discard_sectors >= stripe &&
+ mddev->queue->limits.discard_granularity >= stripe)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
}
return 0;
@@ -5702,7 +5876,8 @@ static int check_reshape(struct mddev *mddev)
if (!check_stripe_cache(mddev))
return -ENOSPC;
- return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
+ return resize_stripes(conf, (conf->previous_raid_disks
+ + mddev->delta_disks));
}
static int raid5_start_reshape(struct mddev *mddev)