md/raid5: ensure sync and DISCARD don't happen at the same time.

commit f8dfcffd0472a0f353f34a567ad3f53568914d04 upstream. A number of problems can occur due to races between resync/recovery and discard. - if sync_request calls handle_stripe() while a discard is happening on the stripe, it might call handle_stripe_clean_event before all of the individual discard requests have completed (so some devices are still locked, but not all). Since commit ca64cae96037de16e4af92678814f5d4bf0c1c65 md/raid5: Make sure we clear R5_Discard when discard is finished. this will cause R5_Discard to be cleared for the parity device, so handle_stripe_clean_event() will not be called when the other devices do become unlocked, so their ->written will not be cleared. This ultimately leads to a WARN_ON in init_stripe and a lock-up. - If handle_stripe_clean_event() does clear R5_UPTODATE at an awkward time for resync, it can lead to s->uptodate being less than disks in handle_parity_checks5(), which triggers a BUG (because it is one). So: - keep R5_Discard on the parity device until all other devices have completed their discard request - make sure we don't try to have a 'discard' and a 'sync' action at the same time. This involves a new stripe flag to we know when a 'discard' is happening, and the use of R5_Overlap on the parity disk so when a discard is wanted while a sync is active, so we know to wake up the discard at the appropriate time. Discard support for RAID5 was added in 3.7, so this is suitable for any -stable kernel since 3.7. Reported-by: Jes Sorensen <Jes.Sorensen@redhat.com> Tested-by: Jes Sorensen <Jes.Sorensen@redhat.com> Signed-off-by: NeilBrown <neilb@suse.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
author: NeilBrown <neilb@suse.de> 2013-03-12 12:18:06 +1100
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2013-03-28 12:18:01 -0700
commit: 9b876a370355dc6e88b60151b9d7014ce4519117 (patch)
tree: 1f5e73265e815ad589150954c81809d147884cce
parent: a9e9cfa2ec38992da5db7c60c651e2e8dce7e393 (diff)
2 files changed, 40 insertions, 6 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0d999bb7964e..94ce78ecbf5b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2612,6 +2612,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
 	int i;
 
 	clear_bit(STRIPE_SYNCING, &sh->state);
+	if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+		wake_up(&conf->wait_for_overlap);
 	s->syncing = 0;
 	s->replacing = 0;
 	/* There is nothing more to do for sync/check/repair.
@@ -2785,6 +2787,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 {
 	int i;
 	struct r5dev *dev;
+	int discard_pending = 0;
 
 	for (i = disks; i--; )
 		if (sh->dev[i].written) {
@@ -2813,9 +2816,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 						STRIPE_SECTORS,
 					 !test_bit(STRIPE_DEGRADED, &sh->state),
 						0);
-			}
-		} else if (test_bit(R5_Discard, &sh->dev[i].flags))
-			clear_bit(R5_Discard, &sh->dev[i].flags);
+			} else if (test_bit(R5_Discard, &dev->flags))
+				discard_pending = 1;
+		}
+	if (!discard_pending &&
+	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+		clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+		if (sh->qd_idx >= 0) {
+			clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+			clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
+		}
+		/* now that discard is done we can proceed with any sync */
+		clear_bit(STRIPE_DISCARD, &sh->state);
+		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+			set_bit(STRIPE_HANDLE, &sh->state);
+
+	}
 
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
 		if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -3467,9 +3484,15 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
-	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
-		set_bit(STRIPE_SYNCING, &sh->state);
-		clear_bit(STRIPE_INSYNC, &sh->state);
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+		spin_lock(&sh->stripe_lock);
+		/* Cannot process 'sync' concurrently with 'discard' */
+		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+			set_bit(STRIPE_SYNCING, &sh->state);
+			clear_bit(STRIPE_INSYNC, &sh->state);
+		}
+		spin_unlock(&sh->stripe_lock);
 	}
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
@@ -3629,6 +3652,8 @@ static void handle_stripe(struct stripe_head *sh)
 	    test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
+		if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+			wake_up(&conf->wait_for_overlap);
 	}
 
 	/* If the failed drives are just a ReadError, then we might need
@@ -4195,6 +4220,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
 		prepare_to_wait(&conf->wait_for_overlap, &w,
 				TASK_UNINTERRUPTIBLE);
+		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+		if (test_bit(STRIPE_SYNCING, &sh->state)) {
+			release_stripe(sh);
+			schedule();
+			goto again;
+		}
+		clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
 		spin_lock_irq(&sh->stripe_lock);
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4207,6 +4239,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				goto again;
 			}
 		}
+		set_bit(STRIPE_DISCARD, &sh->state);
 		finish_wait(&conf->wait_for_overlap, &w);
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1fd..050a334e89c1 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -323,6 +323,7 @@ enum {
 	STRIPE_COMPUTE_RUN,
 	STRIPE_OPS_REQ_PENDING,
 	STRIPE_ON_UNPLUG_LIST,
+	STRIPE_DISCARD,
 };
 
 /*
author	NeilBrown <neilb@suse.de>	2013-03-12 12:18:06 +1100
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2013-03-28 12:18:01 -0700
commit	9b876a370355dc6e88b60151b9d7014ce4519117 (patch)
tree	1f5e73265e815ad589150954c81809d147884cce
parent	a9e9cfa2ec38992da5db7c60c651e2e8dce7e393 (diff)