From 4e5314b56a7ea11c7a5f2b8418992b2f49648a25 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:22 -0800 Subject: [PATCH] md: better handling of readerrors with raid5. This patch changes the behaviour of raid5 when it gets a read error. Instead of just failing the device, it tried to find out what should have been there, and writes it over the bad block. For some media-errors, this has a reasonable chance of fixing the error. If the write succeeds, and a subsequent read succeeds as well, raid5 decided the address is OK and conitnues. Instead of failing a drive on read-error, we attempt to re-write the block, and then re-read. If that all works, we allow the device to remain in the array. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 176fc653c284..f025ba6fb14c 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -154,6 +154,8 @@ struct stripe_head { #define R5_Wantwrite 5 #define R5_Syncio 6 /* this io need to be accounted as resync io */ #define R5_Overlap 7 /* There is a pending overlapping request on this block */ +#define R5_ReadError 8 /* seen a read error here recently */ +#define R5_ReWrite 9 /* have tried to over-write the readerror */ /* * Write method -- cgit v1.2.3 From eae1701fbd264cfc7efbaf7cd4cd999760070e27 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:23 -0800 Subject: [PATCH] md: initial sysfs support for md Start using kobjects in mddevs, and provide a couple of simple attributes (level and disks). Attributes live in /sys/block/mdX/md/attr-name Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index ebce949b1443..a9b0e47a3d04 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -148,6 +148,8 @@ struct mddev_s struct gendisk *gendisk; + struct kobject kobj; + /* Superblock information */ int major_version, minor_version, -- cgit v1.2.3 From 86e6ffdd243a06663713e637ee683fb27dce8e0c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:24 -0800 Subject: [PATCH] md: extend md sysfs support to component devices. Each device in an md array how has a corresponding /sys/block/mdX/md/devNN/ directory which can contain attributes. Currently there is only 'state' which summarises the state, nd 'super' which has a copy of the superblock, and 'block' which is a symlink to the block device. Also, /sys/block/mdX/md/rdNN represents slot 'NN' in the array, and is a symlink to the relevant 'devNN'. Obviously spare devices do not have a slot in the array, and so don't have such a symlink. Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index a9b0e47a3d04..d1dad32ebe07 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -105,6 +105,8 @@ struct mdk_rdev_s int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ + struct kobject kobj; + /* A device can be in one of three states based on two flags: * Not working: faulty==1 in_sync==0 * Fully working: faulty==0 in_sync==1 -- cgit v1.2.3 From 3f294f4fb6f2ba887b717674da26c21f3d57f3fc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:25 -0800 Subject: [PATCH] md: add kobject/sysfs support to raid5 /sys/block/mdX/md/raid5/ contains raid5-related attributes. Currently stripe_cache_size is number of entries in stripe cache, and is settable. stripe_cache_active is number of active entries, and in only readable. Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index f025ba6fb14c..5f4e945c8083 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -228,6 +228,7 @@ struct raid5_private_data { * Cleared when a sync completes. */ + struct kobject kobj; /* * Free stripes pool */ -- cgit v1.2.3 From 24dd469d728dae07f40c5d79ea6dedd38cdf1a30 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:26 -0800 Subject: [PATCH] md: allow a manual resync with md You can trigger a 'check' with echo check > /sys/block/mdX/md/scan_mode or a check-and-repair errors with echo repair > /sys/block/mdX/md/scan_mode and read the current state from the same file. Note: personalities need to know the different between 'check' and 'repair', but don't yet. Until they do, 'check' will be the same as 'repair' and will just do a normal resync pass. Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index d1dad32ebe07..efd04dca0abd 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -182,6 +182,8 @@ struct mddev_s * ERR: and IO error was detected - abort the resync/recovery * INTR: someone requested a (clean) early abort. * DONE: thread is done and is waiting to be reaped + * REQUEST: user-space has requested a sync (used with SYNC) + * CHECK: user-space request for for check-only, no repair */ #define MD_RECOVERY_RUNNING 0 #define MD_RECOVERY_SYNC 1 @@ -189,6 +191,8 @@ struct mddev_s #define MD_RECOVERY_INTR 3 #define MD_RECOVERY_DONE 4 #define MD_RECOVERY_NEEDED 5 +#define MD_RECOVERY_REQUESTED 6 +#define MD_RECOVERY_CHECK 7 unsigned long recovery; int in_sync; /* know to not need resync */ -- cgit v1.2.3 From 9d88883e68f404d5581bd391713ceef470ea53a9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:26 -0800 Subject: [PATCH] md: teach raid5 the difference between 'check' and 'repair'. With this, raid5 can be asked to check parity without repairing it. It also keeps a count of the number of incorrect parity blocks found (mismatches) and reports them through sysfs. Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index efd04dca0abd..cb8b44d1588b 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -175,6 +175,10 @@ struct mddev_s sector_t resync_mark_cnt;/* blocks written at resync_mark */ sector_t resync_max_sectors; /* may be set by personality */ + + sector_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ /* recovery/resync flags * NEEDED: we might need to start a resync/recover * RUNNING: a thread is running, or about to be started -- cgit v1.2.3 From 007583c9253fed363a0bd71b039e9b40a0f6855e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:30 -0800 Subject: [PATCH] md: change raid5 sysfs attribute to not create a new directory There isn't really a need for raid5 attributes to be an a subdirectory, so this patch moves them from /sys/block/mdX/md/raid5/attribute to /sys/block/mdX/md/attribute This suggests that all md personalities should co-operate about namespace usage, but that shouldn't be a problem. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 7 +++++++ include/linux/raid/raid5.h | 1 - 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index cb8b44d1588b..4169c11e5451 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -282,6 +282,13 @@ struct mdk_personality_s }; +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(mddev_t *, char *); + ssize_t (*store)(mddev_t *, const char *, size_t); +}; + + static inline char * mdname (mddev_t * mddev) { return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 5f4e945c8083..f025ba6fb14c 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -228,7 +228,6 @@ struct raid5_private_data { * Cleared when a sync completes. */ - struct kobject kobj; /* * Free stripes pool */ -- cgit v1.2.3 From ba22dcbf106338a5c46d6979f9b19564faae3d49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:31 -0800 Subject: [PATCH] md: improvements to raid5 handling of read errors Two refinements to the 'attempt-overwrite-on-read-error' mechanism. 1/ If the array is read-only, don't attempt an over-write. 2/ If there are more than max_nr_stripes read errors on a device with no success, fail the drive. This will make sure a dead drive will be eventually kicked even when we aren't trying to rewrite (which would normally kick a dead drive more quickly. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4169c11e5451..200c69e34fc0 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -134,6 +134,9 @@ struct mdk_rdev_s * only maintained for arrays that * support hot removal */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ }; typedef struct mdk_personality_s mdk_personality_t; -- cgit v1.2.3 From b2d444d7ad975d555bb919601bcdc0e58975a40e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:31 -0800 Subject: [PATCH] md: convert 'faulty' and 'in_sync' fields to bits in 'flags' field This has the advantage of removing the confusion caused by 'rdev_t' and 'mddev_t' both having 'in_sync' fields. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 200c69e34fc0..11629f92180a 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -117,10 +117,10 @@ struct mdk_rdev_s * It can never have faulty==1, in_sync==1 * This reduces the burden of testing multiple flags in many cases */ - int faulty; /* if faulty do not issue IO requests */ - int in_sync; /* device is a full member of the array */ - unsigned long flags; /* Should include faulty and in_sync here. */ + unsigned long flags; +#define Faulty 1 /* device is known to have a fault */ +#define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ int desc_nr; /* descriptor index in the superblock */ @@ -247,7 +247,7 @@ struct mddev_s static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) { - int faulty = rdev->faulty; + int faulty = test_bit(Faulty, &rdev->flags); if (atomic_dec_and_test(&rdev->nr_pending) && faulty) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } -- cgit v1.2.3 From bd926c63b7a6843d3ce2728396c0891e54fce5c4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:32 -0800 Subject: [PATCH] md: make md on-disk bitmaps not host-endian Current bitmaps use set_bit et.al and so are host-endian, which means not-portable. Oops. Define a new version number (4) for which bitmaps are little-endian. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/bitmap.h | 11 +++++++++-- include/linux/raid/md.h | 4 +++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 9de99198caf1..899437802aea 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -6,7 +6,13 @@ #ifndef BITMAP_H #define BITMAP_H 1 -#define BITMAP_MAJOR 3 +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + #define BITMAP_MINOR 39 /* @@ -133,7 +139,8 @@ typedef __u16 bitmap_counter_t; /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { BITMAP_ACTIVE = 0x001, /* the bitmap is in use */ - BITMAP_STALE = 0x002 /* the bitmap file is out of date or had -EIO */ + BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_HOSTENDIAN = 0x8000, }; /* the superblock at the front of the bitmap file -- little endian */ diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index ffa316ce4dc8..91467a3c4a52 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -66,8 +66,10 @@ * and major_version/minor_version accordingly * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT * in the super status byte + * >=3 means that bitmap superblock version 4 is supported, which uses + * little-ending representation rather than host-endian */ -#define MD_PATCHLEVEL_VERSION 2 +#define MD_PATCHLEVEL_VERSION 3 extern int register_md_personality (int p_num, mdk_personality_t *p); extern int unregister_md_personality (int p_num); -- cgit v1.2.3 From a9701a30470856408d08657eb1bd7ae29a146190 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:34 -0800 Subject: [PATCH] md: support BIO_RW_BARRIER for md/raid1 We can only accept BARRIER requests if all slaves handle barriers, and that can, of course, change with time.... So we keep track of whether the whole array seems safe for barriers, and also whether each individual rdev handles barriers. We initially assumes barriers are OK. When writing the superblock we try a barrier, and if that fails, we flag things for no-barriers. This will usually clear the flags fairly quickly. If writing the superblock finds that BIO_RW_BARRIER is -ENOTSUPP, we need to resubmit, so introduce function "md_super_wait" which waits for requests to finish, and retries ENOTSUPP requests without the barrier flag. When writing the real raid1, write requests which were BIO_RW_BARRIER but which aresn't supported need to be retried. So raid1d is enhanced to do this, and when any bio write completes (i.e. no retry needed) we remove it from the r1bio, so that devices needing retry are easy to find. We should hardly ever get -ENOTSUPP errors when writing data to the raid. It should only happen if: 1/ the device used to support BARRIER, but now doesn't. Few devices change like this, though raid1 can! or 2/ the array has no persistent superblock, so there was no opportunity to pre-test for barriers when writing the superblock. Signed-off-by: Neil Brown Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md.h | 1 + include/linux/raid/md_k.h | 8 ++++++++ include/linux/raid/raid1.h | 4 +++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 91467a3c4a52..13e7c4b62367 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -89,6 +89,7 @@ extern void md_print_devices (void); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 11629f92180a..d5854c2b2721 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -122,6 +122,7 @@ struct mdk_rdev_s #define Faulty 1 /* device is known to have a fault */ #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ +#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ @@ -210,6 +211,13 @@ struct mddev_s int degraded; /* whether md should consider * adding a spare */ + int barriers_work; /* initialised to true, cleared as soon + * as a barrier request to slave + * fails. Only supported + */ + struct bio *biolist; /* bios that need to be retried + * because BIO_RW_BARRIER is not supported + */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h index 60e19b667548..292b98f2b408 100644 --- a/include/linux/raid/raid1.h +++ b/include/linux/raid/raid1.h @@ -110,7 +110,9 @@ struct r1bio_s { #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 -#define R1BIO_BehindIO 3 +#define R1BIO_BehindIO 3 +#define R1BIO_Barrier 4 +#define R1BIO_BarrierRetry 5 /* For write-behind requests, we call bi_end_io when * the last non-write-behind device completes, providing * any write was successful. Otherwise we call when -- cgit v1.2.3 From 787453c2397edcc3261efebb661739acd8c38547 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:43 -0800 Subject: [PATCH] md: complete conversion of md to use kthreads There are a few loose ends following the conversion of md to use kthreads: - Some fields in mdk_thread_t that aren't needed (kthreads does it's own completion and manages it's own name). - thread->run is now never NULL, so no need to check - Some tests for signal_pending that aren't needed (As we don't use signals to stop threads any more) - Some flush_signals are not needed - Some waits are interruptible and don't need to be. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index d5854c2b2721..46629a275ba9 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -334,10 +334,8 @@ typedef struct mdk_thread_s { mddev_t *mddev; wait_queue_head_t wqueue; unsigned long flags; - struct completion *event; struct task_struct *tsk; unsigned long timeout; - const char *name; } mdk_thread_t; #define THREAD_WAKEUP 0 -- cgit v1.2.3