From f3b99be19ded511a1bf05a148276239d9f13eefa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 24 Jun 2010 13:31:03 +1000 Subject: Restore partition detection of newly created md arrays. Commit b821eaa572fd737faaf6928ba046e571526c36c6 broke partition detection for md arrays. The logic was almost right. However if revalidate_disk is called when the device is not yet open, bdev->bd_disk won't be set, so the flush_disk() Call will not set bd_invalidated. So when md_open is called we still need to ensure that ->bd_invalidated gets set. This is easily done with a call to check_disk_size_change in the place where the offending commit removed check_disk_change. At the important times, the size will have changed from 0 to non-zero, so check_disk_size_change will set bd_invalidated. Tested-by: Duncan <1i5t5.duncan@cox.net> Reported-by: Duncan <1i5t5.duncan@cox.net> Signed-off-by: NeilBrown --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 46b3a044eadf..4edcda8f4869 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5895,6 +5895,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); + check_disk_size_change(mddev->gendisk, bdev); out: return err; } -- cgit v1.2.3 From e93f68a1fc6244c05ad8fae28e75835ec74ab34e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 15 Jun 2010 09:36:03 +0100 Subject: md: fix handling of array level takeover that re-arranges devices. Most array level changes leave the list of devices largely unchanged, possibly causing one at the end to become redundant. However conversions between RAID0 and RAID10 need to renumber all devices (except 0). This renumbering is currently being done in the ->run method when the new personality takes over. However this is too late as the common code in md.c might already have invalidated some of the devices if they had a ->raid_disk number that appeared to high. Moving it into the ->takeover method is too early as the array is still active at that time and wrong ->raid_disk numbers could cause confusion. So add a ->new_raid_disk field to mdk_rdev_s and use it to communicate the new raid_disk number. Now the common code knows exactly which devices need to be renumbered, and which can be invalidated, and can do it all at a convenient time when the array is suspend. It can also update some symlinks in sysfs which previously were not be updated correctly. Reported-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/md.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 4edcda8f4869..4869128bf742 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3001,6 +3001,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; } + list_for_each_entry(rdev, &mddev->disks, same_set) + rdev->new_raid_disk = rdev->raid_disk; + /* ->takeover must set new_* and/or delta_disks * if it succeeds, and may set them when it fails. */ @@ -3051,13 +3054,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->safemode = 0; } - module_put(mddev->pers->owner); - /* Invalidate devices that are now superfluous */ - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= mddev->raid_disks) { - rdev->raid_disk = -1; + list_for_each_entry(rdev, &mddev->disks, same_set) { + char nm[20]; + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk > mddev->raid_disks) + rdev->new_raid_disk = -1; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + rdev->raid_disk = rdev->new_raid_disk; + if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); + else { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) + printk("md: cannot register %s for %s after level change\n", + nm, mdname(mddev)); } + } + + module_put(mddev->pers->owner); mddev->pers = pers; mddev->private = priv; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); -- cgit v1.2.3 From 70fffd0bfab1558a8c64c5e903dea1fb84cd9f6b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Jun 2010 17:01:25 +1000 Subject: md: Don't update ->recovery_offset when reshaping an array to fewer devices. When an array is reshaped to have fewer devices, the reshape proceeds from the end of the devices to the beginning. If a device happens to be non-In_sync (which is possible but rare) we would normally update the ->recovery_offset as the reshape progresses. However that would be wrong as the recover_offset records that the early part of the device is in_sync, while in fact it would only be the later part that is in_sync, and in any case the offset number would be measured from the wrong end of the device. Relatedly, if after a reshape a spare is discovered to not be recoverred all the way to the end, not allow spare_active to incorporate it in the array. This becomes relevant in the following sample scenario: A 4 drive RAID5 is converted to a 6 drive RAID6 in a combined operation. The RAID5->RAID6 conversion will cause a 5 drive to be included as a spare, then the 5drive -> 6drive reshape will effectively rebuild that spare as it progresses. The 6th drive is treated as in_sync the whole time as there is never any case that we might consider reading from it, but must not because there is no valid data. If we interrupt this reshape part-way through and reverse it to return to a 5-drive RAID6 (or event a 4-drive RAID5), we don't want to update the recovery_offset - as that would be wrong - and we don't want to include that spare as active in the 5-drive RAID6 when the reversed reshape completed and it will be mostly out-of-sync still. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 4869128bf742..cb20d0b0555a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) /* First make sure individual recovery_offsets are correct */ list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; @@ -6872,6 +6873,7 @@ void md_do_sync(mddev_t *mddev) rcu_read_lock(); list_for_each_entry_rcu(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) -- cgit v1.2.3