From 3c462c880b52aae2cfbbb8db8b401eef118cc128 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 19 Aug 2015 07:35:54 +1000 Subject: md: Increment version for clustered bitmaps Add BITMAP_MAJOR_CLUSTERED as 5, in order to prevent older kernels from assembling a clustered device. In order to maximize compatibility, the major version is set to BITMAP_MAJOR_CLUSTERED *only* if the bitmap is clustered. Added MD_FEATURE_CLUSTERED in order to return an error for older kernels, which would otherwise assemble the MD even if the bitmap is corrupted. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index c702de18207a..1e1bdd86f40c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1735,6 +1735,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) } } + if (mddev_is_clustered(mddev)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); + if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) -- cgit v1.2.3 From c40f341f1e7fd4eddcfc5881d94cfa8669071ee6 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 19 Aug 2015 08:14:42 +1000 Subject: md-cluster: Use a small window for resync Suspending the entire device for resync could take too long. Resync in small chunks instead. The cluster's resync window (32M) is maintained in r1conf as cluster_sync_low and cluster_sync_high and processed in raid1's sync_request(). If the current resync is outside the cluster resync window: 1. Set the cluster_sync_low to curr_resync_completed. 2. Check if the sync will fit in the new window; if not, issue a wait_barrier() and set cluster_sync_low to sector_nr. 3. Set cluster_sync_high to cluster_sync_low + resync_window. 4. Send a message to all nodes so they may add it to their suspension lists. bitmap_cond_end_sync is modified to allow forcing a sync in order to keep curr_resync_completed up to date with the sector passed. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1e1bdd86f40c..9798a9921a38 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7805,9 +7805,6 @@ void md_do_sync(struct md_thread *thread) md_new_event(mddev); update_time = jiffies; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start(mddev, j, max_sectors); - blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7871,8 +7868,6 @@ void md_do_sync(struct md_thread *thread) j = max_sectors; if (j > 2) mddev->curr_resync = j; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_info_update(mddev, j, max_sectors); mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7979,9 +7974,6 @@ void md_do_sync(struct md_thread *thread) } } skip: - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_finish(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); spin_lock(&mddev->lock); -- cgit v1.2.3 From 2910ff17d154baa5eb50e362a91104e831eb2bb6 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 28 Sep 2015 10:27:26 -0500 Subject: md: remove_and_add_spares() to activate specific rdev remove_and_add_spares() checks all devices in order to activate a spare. Change it to activate a specific device if a non-null rdev argument is passed. remove_and_add_spares() can be used to activate spares in slot_store() as well.
For hot_remove_disk(), check if rdev->raid_disk == -1 before calling remove_and_add_spares() Signed-off-by: Goldwyn Rodrigues --- drivers/md/md.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9798a9921a38..e21a2feed826 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2691,15 +2691,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); - err = rdev->mddev->pers-> - hot_add_disk(rdev->mddev, rdev); - if (err) { - rdev->raid_disk = -1; - return err; - } else - sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk == -1) + return -EBUSY; /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -6004,12 +5998,16 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) if (mddev_is_clustered(mddev)) md_cluster_ops->metadata_update_start(mddev); + if (rdev->raid_disk < 0) + goto kick_rdev; + clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); if (rdev->raid_disk >= 0) goto busy; +kick_rdev: if (mddev_is_clustered(mddev)) md_cluster_ops->remove_disk(mddev, rdev); @@ -6024,6 +6022,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) busy: if (mddev_is_clustered(mddev)) md_cluster_ops->metadata_update_cancel(mddev); + printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -8018,10 +8017,12 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); - if (this) + if (this && removed) goto no_add; rdev_for_each(rdev, mddev) { + if (this && this != rdev) + continue; if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) -- cgit v1.2.3 From 70bcecdb1534a7dcd82503b705c27a048d568c9d Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 21 Aug 2015 10:33:39 -0500 Subject: md-cluster: Improve md_reload_sb to be less error prone md_reload_sb is too simplistic and it explicitly needs to determine the changes made by the writing node. However, there are multiple areas where a simple reload could fail. Instead, read the superblock of one of the "good" rdevs and update the necessary information: - read the superblock into a newly allocated page, by temporarily swapping out rdev->sb_page and calling ->load_super. - if that fails return - if it succeeds, call check_sb_changes 1. iterates over list of active devices and checks the matching dev_roles[] value. If that is 'faulty', the device must be marked as faulty - call md_error to mark the device as faulty. Make sure not to set CHANGE_DEVS and wakeup mddev->thread or else it would initiate a resync process, which is the responsibility of the "primary" node. - clear the Blocked bit - Call remove_and_add_spares() to hot remove the device. If the device is 'spare': - call remove_and_add_spares() to get the number of spares added in this operation. - Reduce mddev->degraded to mark the array as not degraded. 2. reset recovery_cp - read the rest of the rdevs to update recovery_offset. 
If recovery_offset is equal to MaxSector, call spare_active() to set it In_sync This required that recovery_offset be initialized to MaxSector, as opposed to zero so as to communicate the end of sync for a rdev. Signed-off-by: Goldwyn Rodrigues --- drivers/md/md.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 107 insertions(+), 14 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index e21a2feed826..12cc28ab9a41 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8924,25 +8924,118 @@ err_wq: return ret; } -void md_reload_sb(struct mddev *mddev) +static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *rdev, *tmp; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct md_rdev *rdev2; + int role, ret; + char b[BDEVNAME_SIZE]; - rdev_for_each_safe(rdev, tmp, mddev) { - rdev->sb_loaded = 0; - ClearPageUptodate(rdev->sb_page); + /* Check for change of roles in the active devices */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Faulty, &rdev2->flags)) + continue; + + /* Check if the roles changed */ + role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + if (role != rdev2->raid_disk) { + /* got activated */ + if (rdev2->raid_disk == -1 && role != 0xffff) { + rdev2->saved_raid_disk = role; + ret = remove_and_add_spares(mddev, rdev2); + pr_info("Activated spare: %s\n", + bdevname(rdev2->bdev,b)); + continue; + } + /* device faulty + * We just want to do the minimum to mark the disk + * as faulty. The recovery is performed by the + * one who initiated the error. + */ + if ((role == 0xfffe) || (role == 0xfffd)) { + md_error(mddev, rdev2); + clear_bit(Blocked, &rdev2->flags); + } + } } - mddev->raid_disks = 0; - analyze_sbs(mddev); - rdev_for_each_safe(rdev, tmp, mddev) { - struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - /* since we don't write to faulty devices, we figure out if the - * disk is faulty by comparing events - */ - if (mddev->events > sb->events) - set_bit(Faulty, &rdev->flags); + + /* recovery_cp changed */ + if (le64_to_cpu(sb->resync_offset) != mddev->recovery_cp) + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + + /* Finally set the event to be up to date */ + mddev->events = le64_to_cpu(sb->events); +} + +static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + int err; + struct page *swapout = rdev->sb_page; + struct mdp_superblock_1 *sb; + + /* Store the sb page of the rdev in the swapout temporary + * variable in case we err in the future + */ + rdev->sb_page = NULL; + alloc_disk_sb(rdev); + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); + + if (err < 0) { + pr_warn("%s: %d Could not reload rdev(%d) err: %d. 
Restoring old values\n", + __func__, __LINE__, rdev->desc_nr, err); + put_page(rdev->sb_page); + rdev->sb_page = swapout; + rdev->sb_loaded = 1; + return err; } + sb = page_address(rdev->sb_page); + /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET + * is not set + */ + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + + /* The other node finished recovery, call spare_active to set + * device In_sync and mddev->degraded + */ + if (rdev->recovery_offset == MaxSector && + !test_bit(In_sync, &rdev->flags) && + mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, "degraded"); + + put_page(swapout); + return 0; +} + +void md_reload_sb(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + int err; + + /* Find the rdev */ + rdev_for_each_rcu(rdev, mddev) { + if (rdev->desc_nr == nr) + break; + } + + if (!rdev || rdev->desc_nr != nr) { + pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); + return; + } + + err = read_rdev(mddev, rdev); + if (err < 0) + return; + + check_sb_changes(mddev, rdev); + + /* Read all rdev's to update recovery_offset */ + rdev_for_each_rcu(rdev, mddev) + read_rdev(mddev, rdev); } EXPORT_SYMBOL(md_reload_sb); -- cgit v1.2.3 From 2aa82191ac36cd2f2a41aa25697db30ed7c619ef Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 28 Sep 2015 19:21:35 -0500 Subject: md-cluster: Perform a lazy update In a clustered environment, a change such as marking a device faulty, can be recorded by any of the nodes. This is communicated to all the nodes and re-recording such a change is unnecessary, and quite often pretty disruptive. With this patch, just before the update, we detect for the changes and if the changes are already in superblock, we abort the update after clearing all the flags Signed-off-by: Goldwyn Rodrigues --- drivers/md/md.c | 101 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 44 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 12cc28ab9a41..5f0967803dc7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2199,6 +2199,46 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } +static bool does_sb_need_changing(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct mdp_superblock_1 *sb; + int role; + + /* Find a good rdev */ + rdev_for_each(rdev, mddev) + if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + break; + + /* No good device found. */ + if (!rdev) + return false; + + sb = page_address(rdev->sb_page); + /* Check if a device has become faulty or a spare become active */ + rdev_for_each(rdev, mddev) { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* Device activated? */ + if (role == 0xffff && rdev->raid_disk >=0 && + !test_bit(Faulty, &rdev->flags)) + return true; + /* Device turned faulty? 
*/ + if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) + return true; + } + + /* Check if any mddev parameters have changed */ + if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || + (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || + (mddev->recovery_cp != le64_to_cpu(sb->resync_offset)) || + (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || + (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) + return true; + + return false; +} + void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; @@ -2211,6 +2251,18 @@ void md_update_sb(struct mddev *mddev, int force_change) set_bit(MD_CHANGE_DEVS, &mddev->flags); return; } + + if (mddev_is_clustered(mddev)) { + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + md_cluster_ops->metadata_update_start(mddev); + /* Has someone else has updated the sb */ + if (!does_sb_need_changing(mddev)) { + md_cluster_ops->metadata_update_cancel(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + return; + } + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { @@ -2359,6 +2411,9 @@ repeat: clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } + + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -2496,13 +2551,9 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) if (mddev_is_clustered(mddev)) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (mddev->pers) md_update_sb(mddev, 1); md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); err = 0; } } else if (cmd_match(buf, "writemostly")) { @@ -4063,12 +4114,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; if (mddev->pers) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); err = update_size(mddev, sectors); md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -5306,8 +5353,6 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { @@ -5326,8 +5371,6 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } void md_stop_writes(struct mddev *mddev) @@ -6015,9 +6058,6 @@ kick_rdev: md_update_sb(mddev, 1); md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - return 0; busy: if (mddev_is_clustered(mddev)) @@ -6073,14 +6113,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) goto abort_export; } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) - goto abort_clustered; + goto abort_export; /* * The rest should better be atomic, we can have disk failures @@ -6090,9 +6128,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) rdev->raid_disk 
= -1; md_update_sb(mddev, 1); - - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); /* * Kick recovery, maybe this spare has to be added to the * array immediately. @@ -6102,9 +6137,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_new_event(mddev); return 0; -abort_clustered: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); abort_export: export_rdev(rdev); return err; @@ -6422,8 +6454,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) return rv; } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -6481,12 +6511,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) } } md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); return rv; err: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); return rv; } @@ -7599,11 +7625,7 @@ int md_allow_write(struct mddev *mddev) mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock(&mddev->lock); @@ -8182,13 +8204,8 @@ void md_check_recovery(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags & MD_UPDATE_SB_FLAGS) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { @@ -8286,8 +8303,6 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_CHANGE_DEVS, &mddev->flags); } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) mddev->pers->finish_reshape(mddev); @@ -8300,8 +8315,6 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); -- cgit v1.2.3 From c186b128cda5a246da25f474e4689cb2bfacfcac Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 30 Sep 2015 13:20:35 -0500 Subject: md-cluster: Perform resync/recovery under a DLM lock Resync or recovery must be performed by only one node at a time. A DLM lock resource, resync_lockres provides the mutual exclusion so that only one node performs the recovery/resync at a time. If a node is unable to get the resync_lockres, because recovery is being performed by another node, it set MD_RECOVER_NEEDED so as to schedule recovery in the future. Remove the debug message in resync_info_update() used during development. 
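
As an aside, the mutual exclusion described here is easy to model outside the kernel. The sketch below is a minimal userspace illustration, not the kernel code: a pthread mutex stands in for the DLM resync_lockres and a plain int stands in for MD_RECOVERY_NEEDED; the real implementation goes through md_cluster_ops->resync_start()/resync_finish() and the md_start_sync() work handler.

    /* Illustrative only: one "node" may start resync at a time; a node that
     * cannot take the lock defers and records that recovery is still needed. */
    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t resync_lockres = PTHREAD_MUTEX_INITIALIZER;
    static int recovery_needed;                 /* models MD_RECOVERY_NEEDED */

    static int try_start_resync(int node)
    {
            if (pthread_mutex_trylock(&resync_lockres) != 0) {
                    recovery_needed = 1;        /* retry on a later pass */
                    return -EAGAIN;
            }
            printf("node %d: resync started\n", node);
            return 0;
    }

    static void finish_resync(int node)
    {
            printf("node %d: resync finished\n", node);
            pthread_mutex_unlock(&resync_lockres);
    }

    int main(void)
    {
            if (try_start_resync(1) == 0) {
                    if (try_start_resync(2) == -EAGAIN)
                            printf("node 2: deferred (recovery_needed=%d)\n",
                                   recovery_needed);
                    finish_resync(1);
            }
            return 0;
    }
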
Signed-off-by: Goldwyn Rodrigues --- drivers/md/md.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5f0967803dc7..61e897def04f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7657,6 +7657,7 @@ void md_do_sync(struct md_thread *thread) struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; + bool cluster_resync_finished = false; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7959,7 +7960,11 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - /* tell personality that we are finished */ + /* tell personality and other nodes that we are finished */ + if (mddev_is_clustered(mddev)) { + md_cluster_ops->resync_finish(mddev); + cluster_resync_finished = true; + } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && @@ -7997,6 +8002,11 @@ void md_do_sync(struct md_thread *thread) skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !cluster_resync_finished) + md_cluster_ops->resync_finish(mddev); + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ @@ -8078,14 +8088,25 @@ no_add: static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); + int ret = 0; + + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) { + mddev->sync_thread = NULL; + goto out; + } + } mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); +out: if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); + if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); -- cgit v1.2.3 From dbb64f8635f5d68192108b88759a34633a4bd558 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 1 Oct 2015 13:20:27 -0500 Subject: md-cluster: Fix adding of new disk with new reload code Adding the disk worked incorrectly with the new reload code. Fix it: - No operation should be performed on rdev marked as Candidate - After a metadata update operation, kick disk if role is 0xfffe else clear Candidate bit and continue with the regular change check. - Saving the mode of the lock resource to check if token lock is already locked, because it can be called twice while adding a disk. However, unlock_comm() must be called only once. - add_new_disk() is called by the node initiating the --add operation. If it needs to be canceled, call add_new_disk_cancel(). The operation is completed by md_update_sb() which will write and unlock the communication. 
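
The acknowledgement rules above can be condensed into a short sketch. The stub functions and disk_add() below are invented stand-ins for md_cluster_ops and the ioctl path; only the ordering mirrors the patch: a Candidate disk is always answered with new_disk_ack(err == 0), the node that initiated --add calls add_new_disk_cancel() only when the bind fails, and on success the later md_update_sb() writes the superblock and unlocks the communication.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-ins for md_cluster_ops (illustration only). */
    static int  add_new_disk(void)        { puts("announce new disk"); return 0; }
    static void add_new_disk_cancel(void) { puts("cancel pending add"); }
    static void new_disk_ack(bool ok)     { printf("ack: %s\n", ok ? "ok" : "fail"); }

    static int disk_add(bool candidate, bool cluster_add, int bind_err)
    {
            if (cluster_add && add_new_disk())
                    return -1;                  /* could not announce the disk */

            if (candidate)                      /* --cluster-confirm path */
                    new_disk_ack(bind_err == 0);
            else if (cluster_add && bind_err)   /* initiator, bind failed */
                    add_new_disk_cancel();
            /* on success, the following md_update_sb() completes the add */

            return bind_err;
    }

    int main(void)
    {
            disk_add(true,  false, 0);          /* remote node confirming a candidate */
            disk_add(false, true, -1);          /* initiating node whose bind failed */
            return 0;
    }
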
Signed-off-by: Goldwyn Rodrigues --- drivers/md/md.c | 52 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 61e897def04f..8a6f67f55d3d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3246,14 +3246,6 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); continue; } - /* No device should have a Candidate flag - * when reading devices - */ - if (test_bit(Candidate, &rdev->flags)) { - pr_info("md: kicking Cluster Candidate %s from array!\n", - bdevname(rdev->bdev, b)); - md_kick_rdev_from_array(rdev); - } } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; @@ -5950,19 +5942,12 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) * check whether the device shows up in other nodes */ if (mddev_is_clustered(mddev)) { - if (info->state & (1 << MD_DISK_CANDIDATE)) { - /* Through --cluster-confirm */ + if (info->state & (1 << MD_DISK_CANDIDATE)) set_bit(Candidate, &rdev->flags); - err = md_cluster_ops->new_disk_ack(mddev, true); - if (err) { - export_rdev(rdev); - return err; - } - } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk_start(mddev, rdev); + err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { - md_cluster_ops->add_new_disk_finish(mddev); export_rdev(rdev); return err; } @@ -5971,13 +5956,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); + if (err) export_rdev(rdev); - else + + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) + md_cluster_ops->new_disk_ack(mddev, (err == 0)); + else { + if (err) + md_cluster_ops->add_new_disk_cancel(mddev); + else + err = add_bound_rdev(rdev); + } + + } else if (!err) err = add_bound_rdev(rdev); - if (mddev_is_clustered(mddev) && - (info->state & (1 << MD_DISK_CLUSTER_ADD))) - md_cluster_ops->add_new_disk_finish(mddev); + return err; } @@ -8055,6 +8050,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev_for_each(rdev, mddev) { if (this && this != rdev) continue; + if (test_bit(Candidate, &rdev->flags)) + continue; if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -8972,6 +8969,17 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) /* Check if the roles changed */ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + + if (test_bit(Candidate, &rdev2->flags)) { + if (role == 0xfffe) { + pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); + md_kick_rdev_from_array(rdev2); + continue; + } + else + clear_bit(Candidate, &rdev2->flags); + } + if (role != rdev2->raid_disk) { /* got activated */ if (rdev2->raid_disk == -1 && role != 0xffff) { -- cgit v1.2.3 From a9720903d1415317e18f439917f760ec592f3e3b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 12 Oct 2015 17:21:27 +0800 Subject: md-cluster: only call kick_rdev_from_array after remove disk successfully For cluster raid, we should not kick it from array if the disk can't be remove from array successfully. 
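
In other words, the cluster-wide remove has to succeed before the local rdev is kicked. A minimal standalone version of that ordering (the stubs are hypothetical; only the err check mirrors the patched state_store() branch):

    #include <stdio.h>

    static int  cluster_remove_disk(int ok) { return ok ? 0 : -1; }   /* stub */
    static void kick_rdev_from_array(void)  { puts("rdev kicked from array"); }

    static int remove_disk(int clustered, int cluster_ok)
    {
            int err = 0;

            if (clustered)
                    err = cluster_remove_disk(cluster_ok);
            if (err == 0)
                    kick_rdev_from_array();     /* only after a successful remove */
            return err;
    }

    int main(void)
    {
            printf("remove ok   -> err=%d\n", remove_disk(1, 1));
            printf("remove fail -> err=%d\n", remove_disk(1, 0));
            return 0;
    }
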
Signed-off-by: Guoqing Jiang Signed-off-by: Goldwyn Rodrigues --- drivers/md/md.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8a6f67f55d3d..d39a72aec316 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2548,13 +2548,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = -EBUSY; else { struct mddev *mddev = rdev->mddev; - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); - md_kick_rdev_from_array(rdev); - if (mddev->pers) - md_update_sb(mddev, 1); - md_new_event(mddev); err = 0; + if (mddev_is_clustered(mddev)) + err = md_cluster_ops->remove_disk(mddev, rdev); + + if (err == 0) { + md_kick_rdev_from_array(rdev); + if (mddev->pers) + md_update_sb(mddev, 1); + md_new_event(mddev); + } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); -- cgit v1.2.3 From 23b63f9fa82eed128b5c585cbfe10ced82d73e91 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 12 Oct 2015 17:21:30 +0800 Subject: md: check the return value for metadata_update_start We shouldn't run related funs of md_cluster_ops in case metadata_update_start returned failure. Signed-off-by: Guoqing Jiang --- drivers/md/md.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index d39a72aec316..a71b36f0acb0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2245,6 +2245,7 @@ void md_update_sb(struct mddev *mddev, int force_change) int sync_req; int nospares = 0; int any_badblocks_changed = 0; + int ret = -1; if (mddev->ro) { if (force_change) @@ -2255,10 +2256,11 @@ void md_update_sb(struct mddev *mddev, int force_change) if (mddev_is_clustered(mddev)) { if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; - md_cluster_ops->metadata_update_start(mddev); + ret = md_cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { - md_cluster_ops->metadata_update_cancel(mddev); + if (ret == 0) + md_cluster_ops->metadata_update_cancel(mddev); clear_bit(MD_CHANGE_PENDING, &mddev->flags); return; } @@ -2412,7 +2414,7 @@ repeat: wake_up(&rdev->blocked_wait); } - if (mddev_is_clustered(mddev)) + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -6031,13 +6033,14 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; + int ret = -1; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + ret = md_cluster_ops->metadata_update_start(mddev); if (rdev->raid_disk < 0) goto kick_rdev; @@ -6049,7 +6052,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); @@ -6058,7 +6061,7 @@ kick_rdev: return 0; busy: - if (mddev_is_clustered(mddev)) + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_cancel(mddev); printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", -- cgit v1.2.3 From 28c1b9fdf4562b52fe104384b16238c39c8a8d40 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 22 Oct 2015 16:01:25 +1100 Subject: md-cluster: Call update_raid_disks() if another node 
--grow's raid_disks To incorporate --grow feature executed on one node, other nodes need to acknowledge the change in number of disks. Call update_raid_disks() to update internal data structures. This leads to call check_reshape() -> md_allow_write() -> md_update_sb(), this results in a deadlock. This is done so it can safely allocate memory (which might trigger writeback which might write to raid1). This is not required for md with a bitmap. In the clustered case, we don't perform md_update_sb() in md_allow_write(), but in do_md_run(). Also we disable safemode for clustered mode. mddev->recovery_cp need not be set in check_sb_changes() because this is required only when a node reads another node's bitmap. mddev->recovery_cp (which is read from sb->resync_offset), is set only if mddev is in_sync. Since we disabled safemode, in_sync is set to zero. In a clustered environment, the MD may not be in sync because another node could be writing to it. So make sure that in_sync is not set in case of clustered node in __md_stop_writes(). Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index a71b36f0acb0..44d034246723 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2230,7 +2230,6 @@ static bool does_sb_need_changing(struct mddev *mddev) /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || - (mddev->recovery_cp != le64_to_cpu(sb->resync_offset)) || (mddev->layout != le64_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) @@ -3314,6 +3313,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { unsigned long msec; + if (mddev_is_clustered(mddev)) { + pr_info("md: Safemode is disabled for clustered mode\n"); + return -EINVAL; + } + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) return -EINVAL; if (msec == 0) @@ -5224,7 +5228,10 @@ int md_run(struct mddev *mddev) atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + if (mddev_is_clustered(mddev)) + mddev->safemode_delay = 0; + else + mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); @@ -5267,6 +5274,9 @@ static int do_md_run(struct mddev *mddev) goto out; } + if (mddev_is_clustered(mddev)) + md_allow_write(mddev); + md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ @@ -5363,9 +5373,11 @@ static void __md_stop_writes(struct mddev *mddev) md_super_wait(mddev); if (mddev->ro == 0 && - (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { + ((!mddev->in_sync && !mddev_is_clustered(mddev)) || + (mddev->flags & MD_UPDATE_SB_FLAGS))) { /* mark array as shutdown cleanly */ - mddev->in_sync = 1; + if (!mddev_is_clustered(mddev)) + mddev->in_sync = 1; md_update_sb(mddev, 1); } } @@ -9007,9 +9019,8 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } } - /* recovery_cp changed */ - if (le64_to_cpu(sb->resync_offset) != mddev->recovery_cp) - mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) + update_raid_disks(mddev, 
le32_to_cpu(sb->raid_disks)); /* Finally set the event to be up to date */ mddev->events = le64_to_cpu(sb->events); -- cgit v1.2.3 From c4d4c91b44d8309082127893221a1971a27c50ca Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 13 Aug 2015 14:31:54 -0700 Subject: MD: replace special disk roles with macros Add the following two macros for special roles: spare and faulty MD_DISK_ROLE_SPARE 0xffff MD_DISK_ROLE_FAULTY 0xfffe Add MD_DISK_ROLE_MAX 0xff00 as the maximal possible regular role, and minimal value of special role. Signed-off-by: Song Liu Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 44d034246723..cfe5c8704a26 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1608,7 +1608,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ++ev1; if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { @@ -1628,14 +1628,14 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) int role; if (rdev->desc_nr < 0 || rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { - role = 0xffff; + role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; } else role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { - case 0xffff: /* spare */ + case MD_DISK_ROLE_SPARE: /* spare */ break; - case 0xfffe: /* faulty */ + case MD_DISK_ROLE_FAULTY: /* faulty */ set_bit(Faulty, &rdev->flags); break; default: @@ -1788,18 +1788,18 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; idev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else - sb->dev_roles[i] = cpu_to_le16(0xffff); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); } sb->sb_csum = calc_sb_1_csum(sb); -- cgit v1.2.3 From bac624f3f86a8c7db395c7f85ccad6a504b9c4b4 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 13 Aug 2015 14:31:55 -0700 Subject: MD: add a new disk role to present write journal device Next patches will use a disk as raid5/6 journaling. We need a new disk role to present the journal device and add MD_FEATURE_JOURNAL to feature_map for backward compability. 
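
Taken together with the previous patch, the dev_roles[] table now carries three special values. The decoder below is a simplified userspace illustration, not super_1_validate(); SPARE, FAULTY and MAX are quoted from the patch above, while the journal value 0xfffd comes from the mainline header and is not spelled out in this excerpt.

    #include <stdint.h>
    #include <stdio.h>

    #define MD_DISK_ROLE_SPARE   0xffff
    #define MD_DISK_ROLE_FAULTY  0xfffe
    #define MD_DISK_ROLE_JOURNAL 0xfffd   /* value from md_p.h, assumed here */
    #define MD_DISK_ROLE_MAX     0xff00   /* highest possible regular role */

    static const char *decode_role(uint16_t role)
    {
            switch (role) {
            case MD_DISK_ROLE_SPARE:   return "spare";
            case MD_DISK_ROLE_FAULTY:  return "faulty";
            case MD_DISK_ROLE_JOURNAL: return "journal";
            default:
                    return role < MD_DISK_ROLE_MAX ? "active slot" : "unknown special";
            }
    }

    int main(void)
    {
            uint16_t samples[] = { 0, 3, 0xff00, 0xfffd, 0xfffe, 0xffff };

            for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                    printf("dev_roles[] = 0x%04x -> %s\n", samples[i],
                           decode_role(samples[i]));
            return 0;
    }
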
Signed-off-by: Song Liu Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index cfe5c8704a26..391341a772c7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1638,6 +1638,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) case MD_DISK_ROLE_FAULTY: /* faulty */ set_bit(Faulty, &rdev->flags); break; + case MD_DISK_ROLE_JOURNAL: /* journal device */ + if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { + /* journal device without journal feature */ + printk(KERN_WARNING + "md: journal device provided without journal feature, ignoring the device\n"); + return -EINVAL; + } + set_bit(Journal, &rdev->flags); + break; default: rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & @@ -1796,7 +1805,10 @@ retry: sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); - else if (rdev2->raid_disk >= 0) + else if (test_bit(Journal, &rdev2->flags)) { + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); + sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); + } else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); @@ -5840,7 +5852,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) else if (test_bit(In_sync, &rdev->flags)) { info.state |= (1<flags)) + info.state |= (1<flags)) info.state |= (1<flags); + if (info->state & (1<flags); /* * check whether the device shows up in other nodes */ @@ -7330,6 +7345,10 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "(F)"); continue; } + if (test_bit(Journal, &rdev->flags)) { + seq_printf(seq, "(J)"); + continue; + } if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ if (test_bit(Replacement, &rdev->flags)) -- cgit v1.2.3 From 3069aa8def32b0c2b83cd27d1c37ed30b47ce879 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 13 Aug 2015 14:31:56 -0700 Subject: md: override md superblock recovery_offset for journal device Journal device stores data in a log structure. We need record the log start. Here we override md superblock recovery_offset for this purpose. This field of a journal device is meaningless otherwise. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 391341a772c7..3592beb6931e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1646,6 +1646,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) return -EINVAL; } set_bit(Journal, &rdev->flags); + rdev->journal_tail = le64_to_cpu(sb->journal_tail); break; default: rdev->saved_raid_disk = role; @@ -1721,6 +1722,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } + /* Note: recovery_offset and journal_tail share space */ + if (test_bit(Journal, &rdev->flags)) + sb->journal_tail = cpu_to_le64(rdev->journal_tail); if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_REPLACEMENT); @@ -8097,6 +8101,8 @@ static int remove_and_add_spares(struct mddev *mddev, continue; if (test_bit(Faulty, &rdev->flags)) continue; + if (test_bit(Journal, &rdev->flags)) + continue; if (mddev->ro && ! 
(rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))) -- cgit v1.2.3 From bd18f6462f3d167a9b3ec27851c98f82694b2adf Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 2 Sep 2015 13:49:50 -0700 Subject: md: skip resync for raid array with journal If a raid array has journal, the journal can guarantee the consistency, we can skip resync after a unclean shutdown. The exception is raid creation or user initiated resync, which we still do a raid resync. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 3592beb6931e..89149acd8a5e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1647,6 +1647,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } set_bit(Journal, &rdev->flags); rdev->journal_tail = le64_to_cpu(sb->journal_tail); + if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); break; default: rdev->saved_raid_disk = role; @@ -1689,6 +1691,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) + sb->resync_offset = cpu_to_le64(MaxSector); else sb->resync_offset = cpu_to_le64(0); -- cgit v1.2.3 From 0b020e85bdd5765aac2440848e7a927069f5f83c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Sep 2015 23:00:35 -0700 Subject: skip match_mddev_units check for special roles match_mddev_units is used to check whether 2 RAID arrays share same disk(s). Arrays that share disk(s) will not do resync at the same time for better performance (fewer HDD seek). However, this check should not apply to Spare, Faulty, and Journal disks, as they do not paticipate in resync. In this patch, match_mddev_units skips check for disks with flag "Faulty" or "Journal" or raid_disk < 0. 
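
Put differently, two arrays now "match" only when they share a backing device through members that actually take part in resync; spares (raid_disk < 0), Faulty and Journal devices are ignored on both sides. A small model of that rule, using plain structs and strings instead of rdev/bdev, so everything except the flag names is invented:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    struct disk {
            const char *bdev;       /* stands in for rdev->bdev->bd_contains */
            int raid_disk;          /* -1 == spare */
            bool faulty, journal;
    };

    static bool counts_for_resync(const struct disk *d)
    {
            return !d->faulty && !d->journal && d->raid_disk != -1;
    }

    static bool arrays_share_disks(const struct disk *a, size_t na,
                                   const struct disk *b, size_t nb)
    {
            for (size_t i = 0; i < na; i++) {
                    if (!counts_for_resync(&a[i]))
                            continue;
                    for (size_t j = 0; j < nb; j++)
                            if (counts_for_resync(&b[j]) &&
                                strcmp(a[i].bdev, b[j].bdev) == 0)
                                    return true;
            }
            return false;
    }

    int main(void)
    {
            struct disk md0[] = { { "sda", 0, false, false },
                                  { "sdb", -1, false, false } };  /* spare */
            struct disk md1[] = { { "sdb", 0, false, false } };

            /* sdb is only a spare in md0, so the arrays no longer match. */
            printf("share: %d\n", arrays_share_disks(md0, 2, md1, 1));
            return 0;
    }
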
Signed-off-by: Song Liu Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 89149acd8a5e..fe67272d0b1b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1935,13 +1935,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) struct md_rdev *rdev, *rdev2; rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev1) - rdev_for_each_rcu(rdev2, mddev2) + rdev_for_each_rcu(rdev, mddev1) { + if (test_bit(Faulty, &rdev->flags) || + test_bit(Journal, &rdev->flags) || + rdev->raid_disk == -1) + continue; + rdev_for_each_rcu(rdev2, mddev2) { + if (test_bit(Faulty, &rdev2->flags) || + test_bit(Journal, &rdev2->flags) || + rdev2->raid_disk == -1) + continue; if (rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { rcu_read_unlock(); return 1; } + } + } rcu_read_unlock(); return 0; } -- cgit v1.2.3 From ac6096e9d5cb88a31f3af2d140df7d680b42745e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sun, 4 Oct 2015 09:20:11 -0700 Subject: md: show journal for journal disk in disk state sysfs Journal disk state sysfs entry should indicate it's journal Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index fe67272d0b1b..5744829b7d05 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2520,6 +2520,10 @@ state_show(struct md_rdev *rdev, char *page) len += sprintf(page+len, "%sin_sync",sep); sep = ","; } + if (test_bit(Journal, &flags)) { + len += sprintf(page+len, "%sjournal",sep); + sep = ","; + } if (test_bit(WriteMostly, &flags)) { len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; -- cgit v1.2.3 From 9efdca16e0182eca489a519f576019fd9c0c1b25 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 12 Oct 2015 16:59:50 -0700 Subject: MD: fix info output for journal disk journal disk can be faulty. The Journal and Faulty aren't exclusive with each other. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5744829b7d05..e4e2731f7660 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5874,7 +5874,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) else if (test_bit(In_sync, &rdev->flags)) { info.state |= (1<flags)) + } + if (test_bit(Journal, &rdev->flags)) info.state |= (1<flags)) info.state |= (1<bdev,b), rdev->desc_nr); if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); + if (test_bit(Journal, &rdev->flags)) + seq_printf(seq, "(J)"); if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; } - if (test_bit(Journal, &rdev->flags)) { - seq_printf(seq, "(J)"); - continue; - } if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ if (test_bit(Replacement, &rdev->flags)) -- cgit v1.2.3 From a97b7896447a89749d9258fbb9d8c3faf48a7a4e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 8 Oct 2015 21:54:09 -0700 Subject: MD: add new bit to indicate raid array with journal If a raid array has journal feature bit set, add a new bit to indicate this. If the array is started without journal disk existing, we know there is something wrong. 
Signed-off-by: Song Liu Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index e4e2731f7660..bca859a6e3fd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1667,6 +1667,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(WriteMostly, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) + set_bit(MD_HAS_JOURNAL, &mddev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); @@ -1807,16 +1809,18 @@ retry: for (i=0; idev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); - else if (test_bit(Journal, &rdev2->flags)) { + else if (test_bit(Journal, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); - sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); - } else if (rdev2->raid_disk >= 0) + else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); -- cgit v1.2.3 From a3dfbdaadba2612faf11f025b8156c36e3700247 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 8 Oct 2015 21:54:11 -0700 Subject: MD: kick out journal disk if it's not fresh When journal disk is faulty and we are reassemabling the raid array, the journal disk is old. We don't allow the journal disk added to the raid array. Since journal disk is missing in the array, the raid5 will mark the array readonly. Signed-off-by: Song Liu Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index bca859a6e3fd..f67cd5b68771 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1608,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ++ev1; if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX) + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { -- cgit v1.2.3 From f2076e7d0643d15b11db979acc7cffd2e8d69e77 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 8 Oct 2015 21:54:12 -0700 Subject: MD: set journal disk ->raid_disk Set journal disk ->raid_disk to >=0, I choose raid_disks + 1 instead of 0, because we already have a disk with ->raid_disk 0 and this causes sysfs entry creation conflict. A lot of places assumes disk with ->raid_disk >=0 is normal raid disk, so we add check for journal disk. 
Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index f67cd5b68771..b5057596b630 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1650,6 +1650,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) rdev->journal_tail = le64_to_cpu(sb->journal_tail); if (mddev->recovery_cp == MaxSector) set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + rdev->raid_disk = mddev->raid_disks; break; default: rdev->saved_raid_disk = role; @@ -1719,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } - if (rdev->raid_disk >= 0 && + if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); @@ -2304,6 +2305,7 @@ repeat: rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; @@ -2540,6 +2542,7 @@ state_show(struct md_rdev *rdev, char *page) sep = ","; } if (!test_bit(Faulty, &flags) && + !test_bit(Journal, &flags) && !test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; @@ -2626,7 +2629,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; - } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags)) { if (rdev->mddev->pers == NULL) { clear_bit(In_sync, &rdev->flags); rdev->saved_raid_disk = rdev->raid_disk; @@ -2645,6 +2649,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * check if recovery is needed. 
*/ if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); @@ -2722,7 +2727,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t slot_show(struct md_rdev *rdev, char *page) { - if (rdev->raid_disk < 0) + if (test_bit(Journal, &rdev->flags)) + return sprintf(page, "journal\n"); + else if (rdev->raid_disk < 0) return sprintf(page, "none\n"); else return sprintf(page, "%d\n", rdev->raid_disk); @@ -2734,6 +2741,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) int slot; int err; + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strncmp(buf, "none", 4)==0) slot = -1; else { @@ -2932,6 +2941,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) sector_t oldsectors = rdev->sectors; sector_t sectors; + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (rdev->data_offset != rdev->new_data_offset) @@ -3294,7 +3305,9 @@ static void analyze_sbs(struct mddev *mddev) rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { + } else if (rdev->raid_disk >= + (mddev->raid_disks - min(0, mddev->delta_disks)) && + !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } @@ -7825,6 +7838,7 @@ void md_do_sync(struct md_thread *thread) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) @@ -8050,6 +8064,7 @@ void md_do_sync(struct md_thread *thread) rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) @@ -8095,7 +8110,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || - ! test_bit(In_sync, &rdev->flags)) && + (!test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags))) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { @@ -8117,6 +8133,7 @@ static int remove_and_add_spares(struct mddev *mddev, continue; if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; if (rdev->raid_disk >= 0) -- cgit v1.2.3 From 339421def582abb14c2217aa8c8f28bb2e299174 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 8 Oct 2015 21:54:13 -0700 Subject: MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW When RAID-4/5/6 array suffers from missing journal device, we put the array in read only state. We should not allow trasition to read-write states (clean and active) before replacing journal device. 
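
The gate added to restart_array() reduces to: if the array was created with a journal (MD_HAS_JOURNAL), at least one non-Faulty Journal member must be present before leaving read-only mode. A standalone version of that check, with simplified flags in place of rdev bit operations and -1 in place of -EINVAL:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct member { bool journal, faulty; };

    /* Returns 0 when switching to read-write is allowed, -1 otherwise. */
    static int can_restart_rw(bool has_journal_feature,
                              const struct member *disks, size_t n)
    {
            if (!has_journal_feature)
                    return 0;               /* no journal was ever configured */

            for (size_t i = 0; i < n; i++)
                    if (disks[i].journal && !disks[i].faulty)
                            return 0;       /* a working journal is present */

            return -1;                      /* journal missing or faulty */
    }

    int main(void)
    {
            struct member ok[]   = { { false, false }, { true, false } };
            struct member gone[] = { { false, false }, { true, true } };

            printf("working journal: %d\n", can_restart_rw(true, ok, 2));
            printf("faulty journal : %d\n", can_restart_rw(true, gone, 2));
            return 0;
    }
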
Signed-off-by: Song Liu Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index b5057596b630..08a4034351de 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3970,7 +3970,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case clean: if (mddev->pers) { - restart_array(mddev); + err = restart_array(mddev); + if (err) + break; spin_lock(&mddev->lock); if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { @@ -3988,7 +3990,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case active: if (mddev->pers) { - restart_array(mddev); + err = restart_array(mddev); + if (err) + break; clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; @@ -5351,6 +5355,25 @@ static int restart_array(struct mddev *mddev) return -EINVAL; if (!mddev->ro) return -EBUSY; + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + struct md_rdev *rdev; + bool has_journal = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { + has_journal = true; + break; + } + } + rcu_read_unlock(); + + /* Don't restart rw with journal missing/faulty */ + if (!has_journal) + return -EINVAL; + } + mddev->safemode = 0; mddev->ro = 0; set_disk_ro(disk, 0); -- cgit v1.2.3
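
Coming back to the "md-cluster: Use a small window for resync" change near the top of this series, the window arithmetic is small enough to show standalone. The 32M window and the cluster_sync_low/cluster_sync_high names come from that commit message; the sector values, the advance function and the omission of wait_barrier() are illustrative simplifications.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RESYNC_WINDOW_SECTORS (32 * 1024 * 1024 / 512)   /* 32M in 512B sectors */

    struct resync_window {
            uint64_t lo;    /* cluster_sync_low */
            uint64_t hi;    /* cluster_sync_high */
    };

    /* Slide the window forward once sector_nr runs past it; a non-zero return
     * means the new low/high must be broadcast so other nodes can suspend it. */
    static int advance_window(struct resync_window *w,
                              uint64_t curr_resync_completed, uint64_t sector_nr)
    {
            if (sector_nr < w->hi)
                    return 0;                           /* still inside the window */

            w->lo = curr_resync_completed;
            if (w->lo + RESYNC_WINDOW_SECTORS <= sector_nr)
                    w->lo = sector_nr;                  /* window would not cover us */
            w->hi = w->lo + RESYNC_WINDOW_SECTORS;
            return 1;
    }

    int main(void)
    {
            struct resync_window w = { 0, 0 };

            for (uint64_t s = 0; s < 400000; s += 50000)
                    if (advance_window(&w, s, s))
                            printf("sector %" PRIu64 ": window [%" PRIu64 ", %" PRIu64 ")\n",
                                   s, w.lo, w.hi);
            return 0;
    }
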