From e0a115e5aa554b93150a8dc1c3fe15467708abb2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Jun 2008 22:45:52 -0700 Subject: md: fix prexor vs sync_request race During the initial array synchronization process there is a window between when a prexor operation is scheduled to a specific stripe and when it completes for a sync_request to be scheduled to the same stripe. When this happens the prexor completes and the stripe is unconditionally marked "insync", effectively canceling the sync_request for the stripe. Prior to 2.6.23 this was not a problem because the prexor operation was done under sh->lock. The effect in older kernels being that the prexor would still erroneously mark the stripe "insync", but sync_request would be held off and re-mark the stripe as "!in_sync". Change the write completion logic to not mark the stripe "in_sync" if a prexor was performed. The effect of the change is to sometimes not set STRIPE_INSYNC. The worst this can do is cause the resync to stall waiting for STRIPE_INSYNC to be set. If this were happening, then STRIPE_SYNCING would be set and handle_issuing_new_read_requests would cause all available blocks to eventually be read, at which point prexor would never be used on that stripe any more and STRIPE_INSYNC would eventually be set. echo repair > /sys/block/mdN/md/sync_action will correct arrays that may have lost this race. Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 425958a76b84..f0f0585c107e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2645,6 +2645,7 @@ static void handle_stripe5(struct stripe_head *sh) struct r5dev *dev; unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; + int prexor; memset(&s, 0, sizeof(s)); pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " @@ -2774,9 +2775,11 @@ static void handle_stripe5(struct stripe_head *sh) /* leave prexor set until postxor is done, allows us to distinguish * a rmw from a rcw during biodrain */ + prexor = 0; if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + prexor = 1; clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); @@ -2810,6 +2813,8 @@ static void handle_stripe5(struct stripe_head *sh) if (!test_and_set_bit( STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; + if (prexor) + continue; if (!test_bit(R5_Insync, &dev->flags) || (i == sh->pd_idx && s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); -- cgit v1.2.3 From a6d8113a986c66aeb379a26b6e0062488b3e59e1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Jun 2008 22:45:53 -0700 Subject: md: fix uninitialized use of mddev->recovery_wait If an array was created with --assume-clean we will oops when trying to set ->resync_max. Fix this by initializing ->recovery_wait in mddev_find. Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 51c19f86ff99..7cf512a34ccf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -276,6 +276,7 @@ static mddev_t * mddev_find(dev_t unit) atomic_set(&new->active, 1); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); + init_waitqueue_head(&new->recovery_wait); new->reshape_position = MaxSector; new->resync_max = MaxSector; new->level = LEVEL_NONE; @@ -5665,7 +5666,6 @@ void md_do_sync(mddev_t *mddev) window/2,(unsigned long long) max_sectors/2); atomic_set(&mddev->recovery_active, 0); - init_waitqueue_head(&mddev->recovery_wait); last_check = 0; if (j>2) { -- cgit v1.2.3 From c337869d95011495fa181536786e74aa2d7ff031 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 5 Jun 2008 22:45:54 -0700 Subject: md: do not compute parity unless it is on a failed drive If a block is computed (rather than read) then a check/repair operation may be lead to believe that the data on disk is correct, when infact it isn't. So only compute blocks for failed devices. This issue has been around since at least 2.6.12, but has become harder to hit in recent kernels since most reads bypass the cache. echo repair > /sys/block/mdN/md/sync_action will set the parity blocks to the correct state. Cc: Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f0f0585c107e..c37e256b1176 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2002,6 +2002,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * have quiesced. */ if ((s->uptodate == disks - 1) && + (s->failed && disk_idx == s->failed_num) && !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); @@ -2087,7 +2088,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, /* we would like to get this block, possibly * by computing it, but we might not be able to */ - if (s->uptodate == disks-1) { + if ((s->uptodate == disks - 1) && + (s->failed && (i == r6s->failed_num[0] || + i == r6s->failed_num[1]))) { pr_debug("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i, 0); -- cgit v1.2.3