From 8932c2e0dcae52e73430878fd8a7a7800176eada Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jun 2006 00:27:36 -0700 Subject: [PATCH] md: remove arbitrary limit on chunk size The largest chunk size the code can support without substantial surgery is 2^30 bytes, so make that the limit instead of an arbitrary 4Meg. Some day, the 'chunksize' should change to a sector-shift instead of a byte-count. Then no limit would be needed. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index e2df61f5b09a..db2ca2d9066e 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -40,7 +40,8 @@ typedef struct mdk_rdev_s mdk_rdev_t; * options passed in raidrun: */ -#define MAX_CHUNK_SIZE (4096*1024) +/* Currently this must fix in an 'int' */ +#define MAX_CHUNK_SIZE (1<<30) /* * MD's 'extended' device -- cgit v1.2.3 From 16a53ecc35f2a80dc285be2e769768847d89ca37 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jun 2006 00:27:38 -0700 Subject: [PATCH] md: merge raid5 and raid6 code There is a lot of commonality between raid5.c and raid6main.c. This patch merges both into one module called raid456. This saves a lot of code, and paves the way for online raid5->raid6 migrations. There is still duplication, e.g. between handle_stripe5 and handle_stripe6. This will probably be cleaned up later. Cc: "H. 
Peter Anvin" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 914af667044f..20ed4c997636 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -212,6 +212,7 @@ struct raid5_private_data { mddev_t *mddev; struct disk_info *spare; int chunk_size, level, algorithm; + int max_degraded; int raid_disks, working_disks, failed_disks; int max_nr_stripes; -- cgit v1.2.3 From 5fd6c1dce06ec24ef3de20fe0c7ecf2ba9fe5ef9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jun 2006 00:27:40 -0700 Subject: [PATCH] md: allow checkpoint of recovery with version-1 superblock For a while we have had checkpointing of resync. The version-1 superblock allows recovery to be checkpointed as well, and this patch implements that. Due to early carelessness we need to add a feature flag to signal that the recovery_offset field is in use, otherwise older kernels would assume that a partially recovered array is in fact fully recovered. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 6 ++++++ include/linux/raid/md_p.h | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index db2ca2d9066e..682574f3bd36 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -88,6 +88,10 @@ struct mdk_rdev_s * array and could again if we did a partial * resync from the bitmap */ + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ atomic_t nr_pending; /* number of pending requests. 
* only maintained for arrays that @@ -183,6 +187,8 @@ struct mddev_s #define MD_RECOVERY_REQUESTED 6 #define MD_RECOVERY_CHECK 7 #define MD_RECOVERY_RESHAPE 8 +#define MD_RECOVERY_FROZEN 9 + unsigned long recovery; int in_sync; /* know to not need resync */ diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index f1fbae7e390e..b6ebc69bae54 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -265,9 +265,12 @@ struct mdp_superblock_1 { /* feature_map bits */ #define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RECOVERY_OFFSET 2 /* recovery_offset is present and + * must be honoured + */ #define MD_FEATURE_RESHAPE_ACTIVE 4 -#define MD_FEATURE_ALL 5 +#define MD_FEATURE_ALL (1|2|4) #endif -- cgit v1.2.3 From 7c7546ccf6463edbeee8d9aac6de7be1cd80d08a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jun 2006 00:27:41 -0700 Subject: [PATCH] md: allow a linear array to have drives added while active Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/linear.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h index 7eaf290e10e7..ba15469daf11 100644 --- a/include/linux/raid/linear.h +++ b/include/linux/raid/linear.h @@ -13,8 +13,10 @@ typedef struct dev_info dev_info_t; struct linear_private_data { + struct linear_private_data *prev; /* earlier version */ dev_info_t **hash_table; sector_t hash_spacing; + sector_t array_size; int preshift; /* shift before dividing by hash_spacing */ dev_info_t disks[0]; }; -- cgit v1.2.3 From c93983bf517c100a31e40ef087e19bd3d7aa2d28 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jun 2006 00:27:41 -0700 Subject: [PATCH] md: support stripe/offset mode in raid10 The "industry standard" DDF format allows for a stripe/offset layout where data is duplicated on different stripes. e.g. 
A B C D D A B C E F G H H E F G (columns are drives, rows are stripes, LETTERS are chunks of data). This is similar to raid10's 'far' mode, but not quite the same. So enhance 'far' mode with a 'far/offset' option which follows the layout of DDFs stripe/offset. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid10.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h index b1103298a8c2..c41e56a7c090 100644 --- a/include/linux/raid/raid10.h +++ b/include/linux/raid/raid10.h @@ -24,11 +24,16 @@ struct r10_private_data_s { int far_copies; /* number of copies layed out * at large strides across drives */ + int far_offset; /* far_copies are offset by 1 stripe + * instead of many + */ int copies; /* near_copies * far_copies. * must be <= raid_disks */ sector_t stride; /* distance between far copies. - * This is size / far_copies + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. */ int chunk_shift; /* shift from chunks to sectors */ -- cgit v1.2.3 From 5e56341d029f0c2cf31e78dc01d4c861ba4d6a5e Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 26 Jun 2006 00:27:42 -0700 Subject: [PATCH] md: make md_print_devices() static This patch makes the needlessly global md_print_devices() static. 
Signed-off-by: Adrian Bunk Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 66b44e5e0d6e..eb3e547c8fee 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -85,8 +85,6 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev); extern void md_unplug_mddev(mddev_t *mddev); -extern void md_print_devices (void); - extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); @@ -97,7 +95,5 @@ extern void md_new_event(mddev_t *mddev); extern void md_update_sb(mddev_t * mddev); -#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } - #endif -- cgit v1.2.3 From 0b79ccf0cdd9f59e5f99017e1a5d23da336544b2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jun 2006 00:27:44 -0700 Subject: [PATCH] md/bitmap: remove bitmap writeback daemon md/bitmap currently has a separate thread to wait for writes to the bitmap file to complete (as we cannot get a callback on that action). However this isn't needed as bitmap_unplug is called from process context and waits for the writeback thread to do its work. The same result can be achieved by doing the waiting directly in bitmap_unplug. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/bitmap.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 899437802aea..9c8907ca60a7 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -244,13 +244,7 @@ struct bitmap { unsigned long daemon_lastrun; /* jiffies of last run */ unsigned long daemon_sleep; /* how many seconds between updates? */ - /* - * bitmap_writeback_daemon waits for file-pages that have been written, - * as there is no way to get a call-back when a page write completes. - */ - mdk_thread_t *writeback_daemon; spinlock_t write_lock; - wait_queue_head_t write_wait; struct list_head complete_pages; mempool_t *write_pool; }; -- cgit v1.2.3 From d785a06a0b9d0cd86b3cc1bf8e236e62af7b47ed Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jun 2006 00:27:48 -0700 Subject: [PATCH] md/bitmap: change md/bitmap file handling to use bmap to file blocks If md is asked to store a bitmap in a file, it tries to hold onto the page cache pages for that file, manipulate them directly, and call a cocktail of operations to write the file out. I don't believe this is a supportable approach. This patch changes the approach to use the same approach as swap files. i.e. bmap is used to enumerate all the block addresses of parts of the file and we write directly to those blocks of the device. swapfile only uses parts of the file that provide full pages at contiguous addresses. We don't have that luxury so we have to cope with pages that are non-contiguous in storage. To handle this we attach buffers to each page, and store the addresses in those buffers. With this approach the pagecache may contain data which is inconsistent with what is on disk. To alleviate the problems this can cause, md invalidates the pagecache when releasing the file. 
If the file is to be examined while the array is active (a non-critical but occasionally useful function), O_DIRECT io must be used. A new version of mdadm will have support for this. This approach simplifies a lot of code: - we no longer need to keep a list of pages which we need to wait for, as the b_endio function can keep track of how many outstanding writes there are. This saves a mempool. - -EAGAIN returns from write_page are no longer possible (not sure if they ever were actually). Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/bitmap.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 9c8907ca60a7..63df898fe2e9 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -140,6 +140,7 @@ typedef __u16 bitmap_counter_t; enum bitmap_state { BITMAP_ACTIVE = 0x001, /* the bitmap is in use */ BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ BITMAP_HOSTENDIAN = 0x8000, }; @@ -244,9 +245,9 @@ struct bitmap { unsigned long daemon_lastrun; /* jiffies of last run */ unsigned long daemon_sleep; /* how many seconds between updates? */ - spinlock_t write_lock; - struct list_head complete_pages; - mempool_t *write_pool; + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + }; /* the bitmap API */ -- cgit v1.2.3 From 42543769142d2375f2b5f8fc9cac999f84bd4c4c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Jun 2006 00:27:57 -0700 Subject: [PATCH] md: Don't write dirty/clean update to spares - leave them alone - record the 'event' count on each individual device (they might sometimes be slightly different now) - add a new value for 'sb_dirty': '3' means that the super block only needs to be updated to record a clean<->dirty transition. 
- Prefer odd event numbers for dirty states and even numbers for clean states - Using all the above, don't update the superblock on a spare device if the update is just doing a clean-dirty transition. To accommodate this, a transition from dirty back to clean might now decrement the events counter if nothing else has changed. The net effect of this is that spare drives will not see any IO requests during normal running of the array, so they can go to sleep if that is what they want to do. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 682574f3bd36..c1e0ac55bab5 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -58,6 +58,7 @@ struct mdk_rdev_s struct page *sb_page; int sb_loaded; + __u64 sb_events; sector_t data_offset; /* start of data in array */ sector_t sb_offset; int sb_size; /* bytes in the superblock */ -- cgit v1.2.3