From 8932c2e0dcae52e73430878fd8a7a7800176eada Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:36 -0700
Subject: [PATCH] md: remove arbitrary limit on chunk size

The largest chunk size the code can support without substantial surgery is
2^30 bytes, so make that the limit instead of an arbitrary 4Meg.  Some day,
the 'chunksize' should change to a sector-shift instead of a byte-count.  Then
no limit would be needed.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/md_k.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index e2df61f5b09a..db2ca2d9066e 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -40,7 +40,8 @@ typedef struct mdk_rdev_s mdk_rdev_t;
  * options passed in raidrun:
  */
 
-#define MAX_CHUNK_SIZE (4096*1024)
+/* Currently this must fix in an 'int' */
+#define MAX_CHUNK_SIZE (1<<30)
 
 /*
  * MD's 'extended' device
-- 
cgit v1.2.3


From 16a53ecc35f2a80dc285be2e769768847d89ca37 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:38 -0700
Subject: [PATCH] md: merge raid5 and raid6 code

There is a lot of commonality between raid5.c and raid6main.c.  This patch
merges both into one module called raid456.  This saves a lot of code, and
paves the way for online raid5->raid6 migrations.

There is still duplication, e.g.  between handle_stripe5 and handle_stripe6.
This will probably be cleaned up later.

Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/raid5.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 914af667044f..20ed4c997636 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -212,6 +212,7 @@ struct raid5_private_data {
 	mddev_t			*mddev;
 	struct disk_info	*spare;
 	int			chunk_size, level, algorithm;
+	int			max_degraded;
 	int			raid_disks, working_disks, failed_disks;
 	int			max_nr_stripes;
 
-- 
cgit v1.2.3


From 5fd6c1dce06ec24ef3de20fe0c7ecf2ba9fe5ef9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:40 -0700
Subject: [PATCH] md: allow checkpoint of recovery with version-1 superblock

For a while we have had checkpointing of resync.  The version-1 superblock
allows recovery to be checkpointed as well, and this patch implements that.

Due to early carelessness we need to add a feature flag to signal that the
recovery_offset field is in use, otherwise older kernels would assume that a
partially recovered array is in fact fully recovered.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/md_k.h | 6 ++++++
 include/linux/raid/md_p.h | 5 ++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index db2ca2d9066e..682574f3bd36 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -88,6 +88,10 @@ struct mdk_rdev_s
 					 * array and could again if we did a partial
 					 * resync from the bitmap
 					 */
+	sector_t	recovery_offset;/* If this device has been partially
+					 * recovered, this is where we were
+					 * up to.
+					 */
 
 	atomic_t	nr_pending;	/* number of pending requests.
 					 * only maintained for arrays that
@@ -183,6 +187,8 @@ struct mddev_s
 #define	MD_RECOVERY_REQUESTED	6
 #define	MD_RECOVERY_CHECK	7
 #define MD_RECOVERY_RESHAPE	8
+#define	MD_RECOVERY_FROZEN	9
+
 	unsigned long			recovery;
 
 	int				in_sync;	/* know to not need resync */
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index f1fbae7e390e..b6ebc69bae54 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -265,9 +265,12 @@ struct mdp_superblock_1 {
 
 /* feature_map bits */
 #define MD_FEATURE_BITMAP_OFFSET	1
+#define	MD_FEATURE_RECOVERY_OFFSET	2 /* recovery_offset is present and
+					   * must be honoured
+					   */
 #define	MD_FEATURE_RESHAPE_ACTIVE	4
 
-#define	MD_FEATURE_ALL			5
+#define	MD_FEATURE_ALL			(1|2|4)
 
 #endif 
 
-- 
cgit v1.2.3


From 7c7546ccf6463edbeee8d9aac6de7be1cd80d08a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:41 -0700
Subject: [PATCH] md: allow a linear array to have drives added while active

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/linear.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h
index 7eaf290e10e7..ba15469daf11 100644
--- a/include/linux/raid/linear.h
+++ b/include/linux/raid/linear.h
@@ -13,8 +13,10 @@ typedef struct dev_info dev_info_t;
 
 struct linear_private_data
 {
+	struct linear_private_data *prev;	/* earlier version */
 	dev_info_t		**hash_table;
 	sector_t		hash_spacing;
+	sector_t		array_size;
 	int			preshift; /* shift before dividing by hash_spacing */
 	dev_info_t		disks[0];
 };
-- 
cgit v1.2.3


From c93983bf517c100a31e40ef087e19bd3d7aa2d28 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:41 -0700
Subject: [PATCH] md: support stripe/offset mode in raid10

The "industry standard" DDF format allows for a stripe/offset layout where
data is duplicated on different stripes.  e.g.

  A  B  C  D
  D  A  B  C
  E  F  G  H
  H  E  F  G

(columns are drives, rows are stripes, LETTERS are chunks of data).

This is similar to raid10's 'far' mode, but not quite the same.  So enhance
'far' mode with a 'far/offset' option which follows the layout of DDF's
stripe/offset.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/raid10.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index b1103298a8c2..c41e56a7c090 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -24,11 +24,16 @@ struct r10_private_data_s {
 	int 			far_copies;   /* number of copies layed out
 					       * at large strides across drives
 					       */
+	int			far_offset;   /* far_copies are offset by 1 stripe
+					       * instead of many
+					       */
 	int			copies;	      /* near_copies * far_copies.
 					       * must be <= raid_disks
 					       */
 	sector_t		stride;	      /* distance between far copies.
-					       * This is size / far_copies
+					       * This is size / far_copies unless
+					       * far_offset, in which case it is
+					       * 1 stripe.
 					       */
 
 	int chunk_shift; /* shift from chunks to sectors */
-- 
cgit v1.2.3


From 5e56341d029f0c2cf31e78dc01d4c861ba4d6a5e Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 26 Jun 2006 00:27:42 -0700
Subject: [PATCH] md: make md_print_devices() static

This patch makes the needlessly global md_print_devices() static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/md.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 66b44e5e0d6e..eb3e547c8fee 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -85,8 +85,6 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
 extern void md_unplug_mddev(mddev_t *mddev);
 
-extern void md_print_devices (void);
-
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
@@ -97,7 +95,5 @@ extern void md_new_event(mddev_t *mddev);
 
 extern void md_update_sb(mddev_t * mddev);
 
-#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
-
 #endif 
 
-- 
cgit v1.2.3


From 0b79ccf0cdd9f59e5f99017e1a5d23da336544b2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:44 -0700
Subject: [PATCH] md/bitmap: remove bitmap writeback daemon

md/bitmap currently has a separate thread to wait for writes to the bitmap
file to complete (as we cannot get a callback on that action).

However this isn't needed as bitmap_unplug is called from process context and
waits for the writeback thread to do its work.  The same result can be
achieved by doing the waiting directly in bitmap_unplug.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/bitmap.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 899437802aea..9c8907ca60a7 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -244,13 +244,7 @@ struct bitmap {
 	unsigned long daemon_lastrun; /* jiffies of last run */
 	unsigned long daemon_sleep; /* how many seconds between updates? */
 
-	/*
-	 * bitmap_writeback_daemon waits for file-pages that have been written,
-	 * as there is no way to get a call-back when a page write completes.
-	 */
-	mdk_thread_t *writeback_daemon;
 	spinlock_t write_lock;
-	wait_queue_head_t write_wait;
 	struct list_head complete_pages;
 	mempool_t *write_pool;
 };
-- 
cgit v1.2.3


From d785a06a0b9d0cd86b3cc1bf8e236e62af7b47ed Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:48 -0700
Subject: [PATCH] md/bitmap: change md/bitmap file handling to use bmap to file
 blocks

If md is asked to store a bitmap in a file, it tries to hold onto the page
cache pages for that file, manipulate them directly, and call a cocktail of
operations to write the file out.  I don't believe this is a supportable
approach.

This patch changes the approach to use the same approach as swap files.  i.e.
bmap is used to enumerate all the block addresses of parts of the file and we
write directly to those blocks of the device.

swapfile only uses parts of the file that provide full pages at contiguous
addresses.  We don't have that luxury so we have to cope with pages that are
non-contiguous in storage.  To handle this we attach buffers to each page, and
store the addresses in those buffers.

With this approach the pagecache may contain data which is inconsistent with
what is on disk.  To alleviate the problems this can cause, md invalidates the
pagecache when releasing the file.  If the file is to be examined while the
array is active (a non-critical but occasionally useful function), O_DIRECT io
must be used.  A new version of mdadm will have support for this.

This approach simplifies a lot of code:
 - we no longer need to keep a list of pages which we need to wait for,
   as the b_endio function can keep track of how many outstanding
   writes there are.  This saves a mempool.
 - -EAGAIN returns from write_page are no longer possible (not sure if
    they ever were actually).

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/bitmap.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 9c8907ca60a7..63df898fe2e9 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -140,6 +140,7 @@ typedef __u16 bitmap_counter_t;
 enum bitmap_state {
 	BITMAP_ACTIVE = 0x001, /* the bitmap is in use */
 	BITMAP_STALE  = 0x002,  /* the bitmap file is out of date or had -EIO */
+	BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
 	BITMAP_HOSTENDIAN = 0x8000,
 };
 
@@ -244,9 +245,9 @@ struct bitmap {
 	unsigned long daemon_lastrun; /* jiffies of last run */
 	unsigned long daemon_sleep; /* how many seconds between updates? */
 
-	spinlock_t write_lock;
-	struct list_head complete_pages;
-	mempool_t *write_pool;
+	atomic_t pending_writes; /* pending writes to the bitmap file */
+	wait_queue_head_t write_wait;
+
 };
 
 /* the bitmap API */
-- 
cgit v1.2.3


From 42543769142d2375f2b5f8fc9cac999f84bd4c4c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 26 Jun 2006 00:27:57 -0700
Subject: [PATCH] md: Don't write dirty/clean update to spares - leave them
 alone

- record the 'event' count on each individual device (they
  might sometimes be slightly different now)
- add a new value for 'sb_dirty': '3' means that the super
  block only needs to be updated to record a clean<->dirty
  transition.
- Prefer odd event numbers for dirty states and even numbers
  for clean states
- Using all the above, don't update the superblock on
  a spare device if the update is just doing a clean-dirty
  transition.  To accommodate this, a transition from
  dirty back to clean might now decrement the events counter
  if nothing else has changed.

The net effect of this is that spare drives will not see any IO requests
during normal running of the array, so they can go to sleep if that is what
they want to do.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/md_k.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/raid')

diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 682574f3bd36..c1e0ac55bab5 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -58,6 +58,7 @@ struct mdk_rdev_s
 
 	struct page	*sb_page;
 	int		sb_loaded;
+	__u64		sb_events;
 	sector_t	data_offset;	/* start of data in array */
 	sector_t	sb_offset;
 	int		sb_size;	/* bytes in the superblock */
-- 
cgit v1.2.3