summary | refs | log | tree | commit | diff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/bitmap.c 39
-rw-r--r-- drivers/md/faulty.c 2
-rw-r--r-- drivers/md/linear.c 2
-rw-r--r-- drivers/md/md.c 395
-rw-r--r-- drivers/md/mktables.c 187
-rw-r--r-- drivers/md/multipath.c 2
-rw-r--r-- drivers/md/raid0.c 8
-rw-r--r-- drivers/md/raid1.c 5
-rw-r--r-- drivers/md/raid10.c 7
-rw-r--r-- drivers/md/raid5.c 48
-rw-r--r-- drivers/md/raid6test/test.c 117
11 files changed, 542 insertions, 270 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1b1ef3130e6e..a0585fb6da94 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -237,7 +237,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
if (!page)
return ERR_PTR(-ENOMEM);
- ITERATE_RDEV(mddev, rdev, tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
if (! test_bit(In_sync, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
@@ -261,7 +261,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
struct list_head *tmp;
mddev_t *mddev = bitmap->mddev;
- ITERATE_RDEV(mddev, rdev, tmp)
+ rdev_for_each(rdev, tmp, mddev)
if (test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags)) {
int size = PAGE_SIZE;
@@ -1348,14 +1348,38 @@ void bitmap_close_sync(struct bitmap *bitmap)
*/
sector_t sector = 0;
int blocks;
- if (!bitmap) return;
+ if (!bitmap)
+ return;
while (sector < bitmap->mddev->resync_max_sectors) {
bitmap_end_sync(bitmap, sector, &blocks, 0);
-/*
- if (sector < 500) printk("bitmap_close_sync: sec %llu blks %d\n",
- (unsigned long long)sector, blocks);
-*/ sector += blocks;
+ sector += blocks;
+ }
+}
+/* Rate-limited end-of-sync: at most once per daemon_sleep seconds, wait for mddev->recovery_active to reach 0 and call bitmap_end_sync() over the sectors below 'sector'. */
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
+{
+ sector_t s = 0;
+ int blocks;
+ /* No bitmap: nothing to do.  sector == 0 only records the current time in last_end_sync. */
+ if (!bitmap)
+ return;
+ if (sector == 0) {
+ bitmap->last_end_sync = jiffies;
+ return;
+ }
+ if (time_before(jiffies, (bitmap->last_end_sync
+ + bitmap->daemon_sleep * HZ)))
+ return; /* less than daemon_sleep seconds since last_end_sync */
+ wait_event(bitmap->mddev->recovery_wait,
+ atomic_read(&bitmap->mddev->recovery_active) == 0); /* sleep until recovery_active reads 0 */
+ /* Round 'sector' down to a multiple of (1 << CHUNK_BLOCK_SHIFT), then walk from 0 up to it (capped at resync_max_sectors). */
+ sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
+ s = 0;
+ while (s < sector && s < bitmap->mddev->resync_max_sectors) {
+ bitmap_end_sync(bitmap, s, &blocks, 0);
+ s += blocks; /* advance by the count written into 'blocks' by bitmap_end_sync() */
}
+ bitmap->last_end_sync = jiffies; /* restart the rate-limit interval */
}
static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
@@ -1565,3 +1589,4 @@ EXPORT_SYMBOL(bitmap_start_sync);
EXPORT_SYMBOL(bitmap_end_sync);
EXPORT_SYMBOL(bitmap_unplug);
EXPORT_SYMBOL(bitmap_close_sync);
+EXPORT_SYMBOL(bitmap_cond_end_sync);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index cf2ddce34118..d107ddceefcd 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -294,7 +294,7 @@ static int run(mddev_t *mddev)
}
conf->nfaults = 0;
- ITERATE_RDEV(mddev, rdev, tmp)
+ rdev_for_each(rdev, tmp, mddev)
conf->rdev = rdev;
mddev->array_size = mddev->size;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3dac1cfb8189..0b8511776b3e 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -122,7 +122,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
cnt = 0;
conf->array_size = 0;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
int j = rdev->raid_disk;
dev_info_t *disk = conf->disks + j;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c28a120b4161..5fc326d3970e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -195,7 +195,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
* Any code which breaks out of this loop while own
* a reference to the current mddev and must mddev_put it.
*/
-#define ITERATE_MDDEV(mddev,tmp) \
+#define for_each_mddev(mddev,tmp) \
\
for (({ spin_lock(&all_mddevs_lock); \
tmp = all_mddevs.next; \
@@ -275,6 +275,7 @@ static mddev_t * mddev_find(dev_t unit)
spin_lock_init(&new->write_lock);
init_waitqueue_head(&new->sb_wait);
new->reshape_position = MaxSector;
+ new->resync_max = MaxSector;
new->queue = blk_alloc_queue(GFP_KERNEL);
if (!new->queue) {
@@ -310,7 +311,7 @@ static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
mdk_rdev_t * rdev;
struct list_head *tmp;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
if (rdev->desc_nr == nr)
return rdev;
}
@@ -322,7 +323,7 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
struct list_head *tmp;
mdk_rdev_t *rdev;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
if (rdev->bdev->bd_dev == dev)
return rdev;
}
@@ -773,12 +774,16 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
__u64 ev1 = md_event(sb);
rdev->raid_disk = -1;
- rdev->flags = 0;
+ clear_bit(Faulty, &rdev->flags);
+ clear_bit(In_sync, &rdev->flags);
+ clear_bit(WriteMostly, &rdev->flags);
+ clear_bit(BarriersNotsupp, &rdev->flags);
+
if (mddev->raid_disks == 0) {
mddev->major_version = 0;
mddev->minor_version = sb->minor_version;
mddev->patch_version = sb->patch_version;
- mddev->persistent = ! sb->not_persistent;
+ mddev->external = 0;
mddev->chunk_size = sb->chunk_size;
mddev->ctime = sb->ctime;
mddev->utime = sb->utime;
@@ -904,7 +909,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->size = mddev->size;
sb->raid_disks = mddev->raid_disks;
sb->md_minor = mddev->md_minor;
- sb->not_persistent = !mddev->persistent;
+ sb->not_persistent = 0;
sb->utime = mddev->utime;
sb->state = 0;
sb->events_hi = (mddev->events>>32);
@@ -938,7 +943,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->state |= (1<<MD_SB_BITMAP_PRESENT);
sb->disks[0].state = (1<<MD_DISK_REMOVED);
- ITERATE_RDEV(mddev,rdev2,tmp) {
+ rdev_for_each(rdev2, tmp, mddev) {
mdp_disk_t *d;
int desc_nr;
if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
@@ -1153,11 +1158,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
__u64 ev1 = le64_to_cpu(sb->events);
rdev->raid_disk = -1;
- rdev->flags = 0;
+ clear_bit(Faulty, &rdev->flags);
+ clear_bit(In_sync, &rdev->flags);
+ clear_bit(WriteMostly, &rdev->flags);
+ clear_bit(BarriersNotsupp, &rdev->flags);
+
if (mddev->raid_disks == 0) {
mddev->major_version = 1;
mddev->patch_version = 0;
- mddev->persistent = 1;
+ mddev->external = 0;
mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
@@ -1286,7 +1295,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
}
max_dev = 0;
- ITERATE_RDEV(mddev,rdev2,tmp)
+ rdev_for_each(rdev2, tmp, mddev)
if (rdev2->desc_nr+1 > max_dev)
max_dev = rdev2->desc_nr+1;
@@ -1295,7 +1304,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
- ITERATE_RDEV(mddev,rdev2,tmp) {
+ rdev_for_each(rdev2, tmp, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1333,8 +1342,8 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
struct list_head *tmp, *tmp2;
mdk_rdev_t *rdev, *rdev2;
- ITERATE_RDEV(mddev1,rdev,tmp)
- ITERATE_RDEV(mddev2, rdev2, tmp2)
+ rdev_for_each(rdev, tmp, mddev1)
+ rdev_for_each(rdev2, tmp2, mddev2)
if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains)
return 1;
@@ -1401,7 +1410,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
goto fail;
}
list_add(&rdev->same_set, &mddev->disks);
- bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
+ bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
return 0;
fail:
@@ -1410,10 +1419,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
return err;
}
-static void delayed_delete(struct work_struct *ws)
+static void md_delayed_delete(struct work_struct *ws)
{
mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
kobject_del(&rdev->kobj);
+ kobject_put(&rdev->kobj); /* drops the reference taken with kobject_get() just before this work is scheduled in unbind_rdev_from_array() */
}
static void unbind_rdev_from_array(mdk_rdev_t * rdev)
@@ -1432,7 +1442,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state"
*/
- INIT_WORK(&rdev->del_work, delayed_delete);
+ INIT_WORK(&rdev->del_work, md_delayed_delete);
+ kobject_get(&rdev->kobj);
schedule_work(&rdev->del_work);
}
@@ -1441,7 +1452,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
* otherwise reused by a RAID array (or any other kernel
* subsystem), by bd_claiming the device.
*/
-static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
+static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
{
int err = 0;
struct block_device *bdev;
@@ -1453,13 +1464,15 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
__bdevname(dev, b));
return PTR_ERR(bdev);
}
- err = bd_claim(bdev, rdev);
+ err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
if (err) {
printk(KERN_ERR "md: could not bd_claim %s.\n",
bdevname(bdev, b));
blkdev_put(bdev);
return err;
}
+ if (!shared)
+ set_bit(AllReserved, &rdev->flags);
rdev->bdev = bdev;
return err;
}
@@ -1503,7 +1516,7 @@ static void export_array(mddev_t *mddev)
struct list_head *tmp;
mdk_rdev_t *rdev;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
if (!rdev->mddev) {
MD_BUG();
continue;
@@ -1581,17 +1594,17 @@ static void md_print_devices(void)
printk("md: **********************************\n");
printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
printk("md: **********************************\n");
- ITERATE_MDDEV(mddev,tmp) {
+ for_each_mddev(mddev, tmp) {
if (mddev->bitmap)
bitmap_print_sb(mddev->bitmap);
else
printk("%s: ", mdname(mddev));
- ITERATE_RDEV(mddev,rdev,tmp2)
+ rdev_for_each(rdev, tmp2, mddev)
printk("<%s>", bdevname(rdev->bdev,b));
printk("\n");
- ITERATE_RDEV(mddev,rdev,tmp2)
+ rdev_for_each(rdev, tmp2, mddev)
print_rdev(rdev);
}
printk("md: **********************************\n");
@@ -1610,7 +1623,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
mdk_rdev_t *rdev;
struct list_head *tmp;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
if (rdev->sb_events == mddev->events ||
(nospares &&
rdev->raid_disk < 0 &&
@@ -1696,18 +1709,20 @@ repeat:
MD_BUG();
mddev->events --;
}
- sync_sbs(mddev, nospares);
/*
* do not write anything to disk if using
* nonpersistent superblocks
*/
if (!mddev->persistent) {
- clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ if (!mddev->external)
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+
spin_unlock_irq(&mddev->write_lock);
wake_up(&mddev->sb_wait);
return;
}
+ sync_sbs(mddev, nospares);
spin_unlock_irq(&mddev->write_lock);
dprintk(KERN_INFO
@@ -1715,7 +1730,7 @@ repeat:
mdname(mddev),mddev->in_sync);
bitmap_update_sb(mddev->bitmap);
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
char b[BDEVNAME_SIZE];
dprintk(KERN_INFO "md: ");
if (rdev->sb_loaded != 1)
@@ -1785,7 +1800,7 @@ static ssize_t
state_show(mdk_rdev_t *rdev, char *page)
{
char *sep = "";
- int len=0;
+ size_t len = 0;
if (test_bit(Faulty, &rdev->flags)) {
len+= sprintf(page+len, "%sfaulty",sep);
@@ -1887,20 +1902,45 @@ static ssize_t
slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
char *e;
+ int err;
+ char nm[20];
int slot = simple_strtoul(buf, &e, 10);
if (strncmp(buf, "none", 4)==0)
slot = -1;
else if (e==buf || (*e && *e!= '\n'))
return -EINVAL;
- if (rdev->mddev->pers)
- /* Cannot set slot in active array (yet) */
- return -EBUSY;
- if (slot >= rdev->mddev->raid_disks)
- return -ENOSPC;
- rdev->raid_disk = slot;
- /* assume it is working */
- rdev->flags = 0;
- set_bit(In_sync, &rdev->flags);
+ if (rdev->mddev->pers) {
+ /* Setting 'slot' on an active array requires also
+ * updating the 'rd%d' link, and communicating
+ * with the personality with ->hot_*_disk.
+ * For now we only support removing
+ * failed/spare devices. This normally happens automatically,
+ * but not when the metadata is externally managed.
+ */
+ if (slot != -1)
+ return -EBUSY;
+ if (rdev->raid_disk == -1)
+ return -EEXIST;
+ /* personality does all needed checks, but it must provide ->hot_remove_disk (the hook called below) */
+ if (rdev->mddev->pers->hot_remove_disk == NULL)
+ return -EINVAL;
+ err = rdev->mddev->pers->
+ hot_remove_disk(rdev->mddev, rdev->raid_disk);
+ if (err)
+ return err;
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&rdev->mddev->kobj, nm);
+ set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+ md_wakeup_thread(rdev->mddev->thread);
+ } else {
+ if (slot >= rdev->mddev->raid_disks)
+ return -ENOSPC;
+ rdev->raid_disk = slot;
+ /* assume it is working */
+ clear_bit(Faulty, &rdev->flags);
+ clear_bit(WriteMostly, &rdev->flags);
+ set_bit(In_sync, &rdev->flags);
+ }
return len;
}
@@ -1923,6 +1963,10 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
return -EINVAL;
if (rdev->mddev->pers)
return -EBUSY;
+ if (rdev->size && rdev->mddev->external)
+ /* Must set offset before size, so overlap checks
+ * can be sane */
+ return -EBUSY;
rdev->data_offset = offset;
return len;
}
@@ -1936,16 +1980,69 @@ rdev_size_show(mdk_rdev_t *rdev, char *page)
return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
}
+static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
+{
+ /* check if two start/length pairs overlap; returns 1 if they do, 0 if not (ranges that merely touch, e.g. s1+l1 == s2, do not overlap) */
+ if (s1+l1 <= s2)
+ return 0;
+ if (s2+l2 <= s1)
+ return 0;
+ return 1;
+}
+
static ssize_t
rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
char *e;
unsigned long long size = simple_strtoull(buf, &e, 10);
+ unsigned long long oldsize = rdev->size;
if (e==buf || (*e && *e != '\n'))
return -EINVAL;
if (rdev->mddev->pers)
return -EBUSY;
rdev->size = size;
+ if (size > oldsize && rdev->mddev->external) {
+ /* need to check that all other rdevs with the same ->bdev
+ * do not overlap. We need to unlock the mddev to avoid
+ * a deadlock. We have already changed rdev->size, and if
+ * we have to change it back, we will have the lock again.
+ */
+ mddev_t *mddev;
+ int overlap = 0;
+ struct list_head *tmp, *tmp2;
+
+ mddev_unlock(rdev->mddev);
+ for_each_mddev(mddev, tmp) {
+ mdk_rdev_t *rdev2;
+
+ mddev_lock(mddev);
+ rdev_for_each(rdev2, tmp2, mddev)
+ if (test_bit(AllReserved, &rdev2->flags) ||
+ (rdev->bdev == rdev2->bdev &&
+ rdev != rdev2 &&
+ overlaps(rdev->data_offset, rdev->size,
+ rdev2->data_offset, rdev2->size))) {
+ overlap = 1;
+ break;
+ }
+ mddev_unlock(mddev);
+ if (overlap) {
+ mddev_put(mddev);
+ break;
+ }
+ }
+ mddev_lock(rdev->mddev);
+ if (overlap) {
+ /* Someone else could have slipped in a size
+ * change here, but doing so is just silly.
+ * We put oldsize back because we *know* it is
+ * safe, and trust userspace not to race with
+ * itself
+ */
+ rdev->size = oldsize;
+ return -EBUSY;
+ }
+ }
if (size < rdev->mddev->size || rdev->mddev->size == 0)
rdev->mddev->size = size;
return len;
@@ -1980,12 +2077,18 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
{
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+ int rv;
if (!entry->store)
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- return entry->store(rdev, page, length);
+ rv = mddev_lock(rdev->mddev);
+ if (!rv) {
+ rv = entry->store(rdev, page, length);
+ mddev_unlock(rdev->mddev);
+ }
+ return rv;
}
static void rdev_free(struct kobject *ko)
@@ -2029,7 +2132,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
if ((err = alloc_disk_sb(rdev)))
goto abort_free;
- err = lock_rdev(rdev, newdev);
+ err = lock_rdev(rdev, newdev, super_format == -2);
if (err)
goto abort_free;
@@ -2099,7 +2202,7 @@ static void analyze_sbs(mddev_t * mddev)
char b[BDEVNAME_SIZE];
freshest = NULL;
- ITERATE_RDEV(mddev,rdev,tmp)
+ rdev_for_each(rdev, tmp, mddev)
switch (super_types[mddev->major_version].
load_super(rdev, freshest, mddev->minor_version)) {
case 1:
@@ -2120,7 +2223,7 @@ static void analyze_sbs(mddev_t * mddev)
validate_super(mddev, freshest);
i = 0;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
if (rdev != freshest)
if (super_types[mddev->major_version].
validate_super(mddev, rdev)) {
@@ -2215,7 +2318,7 @@ level_show(mddev_t *mddev, char *page)
static ssize_t
level_store(mddev_t *mddev, const char *buf, size_t len)
{
- int rv = len;
+ ssize_t rv = len;
if (mddev->pers)
return -EBUSY;
if (len == 0)
@@ -2425,6 +2528,8 @@ array_state_show(mddev_t *mddev, char *page)
case 0:
if (mddev->in_sync)
st = clean;
+ else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+ st = write_pending;
else if (mddev->safemode)
st = active_idle;
else
@@ -2455,11 +2560,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
break;
case clear:
/* stopping an active array */
- if (mddev->pers) {
- if (atomic_read(&mddev->active) > 1)
- return -EBUSY;
- err = do_md_stop(mddev, 0);
- }
+ if (atomic_read(&mddev->active) > 1)
+ return -EBUSY;
+ err = do_md_stop(mddev, 0);
break;
case inactive:
/* stopping an active array */
@@ -2467,7 +2570,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
if (atomic_read(&mddev->active) > 1)
return -EBUSY;
err = do_md_stop(mddev, 2);
- }
+ } else
+ err = 0; /* already inactive */
break;
case suspended:
break; /* not supported yet */
@@ -2495,9 +2599,15 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
restart_array(mddev);
spin_lock_irq(&mddev->write_lock);
if (atomic_read(&mddev->writes_pending) == 0) {
- mddev->in_sync = 1;
- set_bit(MD_CHANGE_CLEAN, &mddev->flags);
- }
+ if (mddev->in_sync == 0) {
+ mddev->in_sync = 1;
+ if (mddev->persistent)
+ set_bit(MD_CHANGE_CLEAN,
+ &mddev->flags);
+ }
+ err = 0;
+ } else
+ err = -EBUSY;
spin_unlock_irq(&mddev->write_lock);
} else {
mddev->ro = 0;
@@ -2508,7 +2618,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
case active:
if (mddev->pers) {
restart_array(mddev);
- clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ if (mddev->external)
+ clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
wake_up(&mddev->sb_wait);
err = 0;
} else {
@@ -2574,7 +2685,9 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len)
if (err < 0)
goto out;
}
- } else
+ } else if (mddev->external)
+ rdev = md_import_device(dev, -2, -1);
+ else
rdev = md_import_device(dev, -1, -1);
if (IS_ERR(rdev))
@@ -2659,7 +2772,9 @@ __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
/* Metdata version.
- * This is either 'none' for arrays with externally managed metadata,
+ * This is one of
+ * 'none' for arrays with no metadata (good luck...)
+ * 'external' for arrays with externally managed metadata,
* or N.M for internally known formats
*/
static ssize_t
@@ -2668,6 +2783,8 @@ metadata_show(mddev_t *mddev, char *page)
if (mddev->persistent)
return sprintf(page, "%d.%d\n",
mddev->major_version, mddev->minor_version);
+ else if (mddev->external)
+ return sprintf(page, "external:%s\n", mddev->metadata_type);
else
return sprintf(page, "none\n");
}
@@ -2682,6 +2799,21 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
if (cmd_match(buf, "none")) {
mddev->persistent = 0;
+ mddev->external = 0;
+ mddev->major_version = 0;
+ mddev->minor_version = 90;
+ return len;
+ }
+ if (strncmp(buf, "external:", 9) == 0) {
+ size_t namelen = len-9;
+ if (namelen >= sizeof(mddev->metadata_type))
+ namelen = sizeof(mddev->metadata_type)-1;
+ strncpy(mddev->metadata_type, buf+9, namelen);
+ mddev->metadata_type[namelen] = 0;
+ if (namelen && mddev->metadata_type[namelen-1] == '\n')
+ mddev->metadata_type[--namelen] = 0;
+ mddev->persistent = 0;
+ mddev->external = 1;
mddev->major_version = 0;
mddev->minor_version = 90;
return len;
@@ -2698,6 +2830,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
mddev->major_version = major;
mddev->minor_version = minor;
mddev->persistent = 1;
+ mddev->external = 0;
return len;
}
@@ -2865,6 +2998,43 @@ sync_completed_show(mddev_t *mddev, char *page)
static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
static ssize_t
+max_sync_show(mddev_t *mddev, char *page)
+{ /* sysfs show for sync_max: prints mddev->resync_max into 'page' */
+ if (mddev->resync_max == MaxSector) /* MaxSector is reported as the word "max" */
+ return sprintf(page, "max\n");
+ else
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->resync_max);
+}
+static ssize_t
+max_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{ /* sysfs store for sync_max: sets mddev->resync_max from "max" or a decimal number; returns len or a negative errno */
+ if (strncmp(buf, "max", 3) == 0) /* prefix match: any input starting with "max" is accepted */
+ mddev->resync_max = MaxSector;
+ else {
+ char *ep;
+ unsigned long long max = simple_strtoull(buf, &ep, 10);
+ if (ep == buf || (*ep != 0 && *ep != '\n'))
+ return -EINVAL;
+ if (max < mddev->resync_max &&
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return -EBUSY; /* the limit may not be lowered while MD_RECOVERY_RUNNING is set */
+
+ /* Must be a multiple of chunk_size (compared as chunk_size>>9; the mask test presumes that value is a power of two -- NOTE(review): confirm) */
+ if (mddev->chunk_size) {
+ if (max & (sector_t)((mddev->chunk_size>>9)-1))
+ return -EINVAL;
+ }
+ mddev->resync_max = max;
+ }
+ wake_up(&mddev->recovery_wait); /* md_do_sync() waits on recovery_wait for resync_max > j; let it re-check */
+ return len;
+}
+
+static struct md_sysfs_entry md_max_sync =
+__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
+
+static ssize_t
suspend_lo_show(mddev_t *mddev, char *page)
{
return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
@@ -2974,6 +3144,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_sync_max.attr,
&md_sync_speed.attr,
&md_sync_completed.attr,
+ &md_max_sync.attr,
&md_suspend_lo.attr,
&md_suspend_hi.attr,
&md_bitmap.attr,
@@ -3118,8 +3289,11 @@ static int do_md_run(mddev_t * mddev)
/*
* Analyze all RAID superblock(s)
*/
- if (!mddev->raid_disks)
+ if (!mddev->raid_disks) {
+ if (!mddev->persistent)
+ return -EINVAL;
analyze_sbs(mddev);
+ }
chunk_size = mddev->chunk_size;
@@ -3143,7 +3317,7 @@ static int do_md_run(mddev_t * mddev)
}
/* devices must have minimum size of one chunk */
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
if (test_bit(Faulty, &rdev->flags))
continue;
if (rdev->size < chunk_size / 1024) {
@@ -3170,7 +3344,7 @@ static int do_md_run(mddev_t * mddev)
* the only valid external interface is through the md
* device.
*/
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
if (test_bit(Faulty, &rdev->flags))
continue;
sync_blockdev(rdev->bdev);
@@ -3236,8 +3410,8 @@ static int do_md_run(mddev_t * mddev)
mdk_rdev_t *rdev2;
struct list_head *tmp2;
int warned = 0;
- ITERATE_RDEV(mddev, rdev, tmp) {
- ITERATE_RDEV(mddev, rdev2, tmp2) {
+ rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each(rdev2, tmp2, mddev) {
if (rdev < rdev2 &&
rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
@@ -3297,7 +3471,7 @@ static int do_md_run(mddev_t * mddev)
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
- ITERATE_RDEV(mddev,rdev,tmp)
+ rdev_for_each(rdev, tmp, mddev)
if (rdev->raid_disk >= 0) {
char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3330,7 +3504,7 @@ static int do_md_run(mddev_t * mddev)
if (mddev->degraded && !mddev->sync_thread) {
struct list_head *rtmp;
int spares = 0;
- ITERATE_RDEV(mddev,rdev,rtmp)
+ rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
@@ -3507,14 +3681,14 @@ static int do_md_stop(mddev_t * mddev, int mode)
}
mddev->bitmap_offset = 0;
- ITERATE_RDEV(mddev,rdev,tmp)
+ rdev_for_each(rdev, tmp, mddev)
if (rdev->raid_disk >= 0) {
char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
- /* make sure all delayed_delete calls have finished */
+ /* make sure all md_delayed_delete calls have finished */
flush_scheduled_work();
export_array(mddev);
@@ -3523,7 +3697,10 @@ static int do_md_stop(mddev_t * mddev, int mode)
mddev->size = 0;
mddev->raid_disks = 0;
mddev->recovery_cp = 0;
+ mddev->resync_max = MaxSector;
mddev->reshape_position = MaxSector;
+ mddev->external = 0;
+ mddev->persistent = 0;
} else if (mddev->pers)
printk(KERN_INFO "md: %s switched to read-only mode.\n",
@@ -3546,7 +3723,7 @@ static void autorun_array(mddev_t *mddev)
printk(KERN_INFO "md: running: ");
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
char b[BDEVNAME_SIZE];
printk("<%s>", bdevname(rdev->bdev,b));
}
@@ -3589,7 +3766,7 @@ static void autorun_devices(int part)
printk(KERN_INFO "md: considering %s ...\n",
bdevname(rdev0->bdev,b));
INIT_LIST_HEAD(&candidates);
- ITERATE_RDEV_PENDING(rdev,tmp)
+ rdev_for_each_list(rdev, tmp, pending_raid_disks)
if (super_90_load(rdev, rdev0, 0) >= 0) {
printk(KERN_INFO "md: adding %s ...\n",
bdevname(rdev->bdev,b));
@@ -3632,7 +3809,8 @@ static void autorun_devices(int part)
mddev_unlock(mddev);
} else {
printk(KERN_INFO "md: created %s\n", mdname(mddev));
- ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+ mddev->persistent = 1;
+ rdev_for_each_list(rdev, tmp, candidates) {
list_del_init(&rdev->same_set);
if (bind_rdev_to_array(rdev, mddev))
export_rdev(rdev);
@@ -3643,7 +3821,7 @@ static void autorun_devices(int part)
/* on success, candidates will be empty, on error
* it won't...
*/
- ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+ rdev_for_each_list(rdev, tmp, candidates)
export_rdev(rdev);
mddev_put(mddev);
}
@@ -3673,7 +3851,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
struct list_head *tmp;
nr=working=active=failed=spare=0;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
nr++;
if (test_bit(Faulty, &rdev->flags))
failed++;
@@ -3919,8 +4097,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
else
rdev->raid_disk = -1;
- rdev->flags = 0;
-
if (rdev->raid_disk < mddev->raid_disks)
if (info->state & (1<<MD_DISK_SYNC))
set_bit(In_sync, &rdev->flags);
@@ -4165,13 +4341,15 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
else
mddev->recovery_cp = 0;
mddev->persistent = ! info->not_persistent;
+ mddev->external = 0;
mddev->layout = info->layout;
mddev->chunk_size = info->chunk_size;
mddev->max_disks = MD_SB_DISKS;
- mddev->flags = 0;
+ if (mddev->persistent)
+ mddev->flags = 0;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
@@ -4213,7 +4391,7 @@ static int update_size(mddev_t *mddev, unsigned long size)
*/
if (mddev->sync_thread)
return -EBUSY;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
sector_t avail;
avail = rdev->size * 2;
@@ -4471,9 +4649,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
*/
/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
* RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
- if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
- && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
- && cmd != GET_BITMAP_FILE) {
+ if ((!mddev->raid_disks && !mddev->external)
+ && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
+ && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
+ && cmd != GET_BITMAP_FILE) {
err = -ENODEV;
goto abort_unlock;
}
@@ -4757,7 +4936,7 @@ static void status_unused(struct seq_file *seq)
seq_printf(seq, "unused devices: ");
- ITERATE_RDEV_PENDING(rdev,tmp) {
+ rdev_for_each_list(rdev, tmp, pending_raid_disks) {
char b[BDEVNAME_SIZE];
i++;
seq_printf(seq, "%s ",
@@ -4953,7 +5132,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
}
size = 0;
- ITERATE_RDEV(mddev,rdev,tmp2) {
+ rdev_for_each(rdev, tmp2, mddev) {
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -4982,7 +5161,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
mddev->major_version,
mddev->minor_version);
}
- } else
+ } else if (mddev->external)
+ seq_printf(seq, " super external:%s",
+ mddev->metadata_type);
+ else
seq_printf(seq, " super non-persistent");
if (mddev->pers) {
@@ -5106,7 +5288,7 @@ static int is_mddev_idle(mddev_t *mddev)
long curr_events;
idle = 1;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
curr_events = disk_stat_read(disk, sectors[0]) +
disk_stat_read(disk, sectors[1]) -
@@ -5283,7 +5465,7 @@ void md_do_sync(mddev_t *mddev)
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
goto skip;
}
- ITERATE_MDDEV(mddev2,tmp) {
+ for_each_mddev(mddev2, tmp) {
if (mddev2 == mddev)
continue;
if (mddev2->curr_resync &&
@@ -5333,7 +5515,7 @@ void md_do_sync(mddev_t *mddev)
/* recovery follows the physical size of devices */
max_sectors = mddev->size << 1;
j = MaxSector;
- ITERATE_RDEV(mddev,rdev,rtmp)
+ rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
@@ -5381,8 +5563,16 @@ void md_do_sync(mddev_t *mddev)
sector_t sectors;
skipped = 0;
+ if (j >= mddev->resync_max) {
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ wait_event(mddev->recovery_wait,
+ mddev->resync_max > j
+ || kthread_should_stop());
+ }
+ if (kthread_should_stop())
+ goto interrupted;
sectors = mddev->pers->sync_request(mddev, j, &skipped,
- currspeed < speed_min(mddev));
+ currspeed < speed_min(mddev));
if (sectors == 0) {
set_bit(MD_RECOVERY_ERR, &mddev->recovery);
goto out;
@@ -5424,15 +5614,9 @@ void md_do_sync(mddev_t *mddev)
}
- if (kthread_should_stop()) {
- /*
- * got a signal, exit.
- */
- printk(KERN_INFO
- "md: md_do_sync() got signal ... exiting\n");
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- goto out;
- }
+ if (kthread_should_stop())
+ goto interrupted;
+
/*
* this loop exits only if either when we are slower than
@@ -5484,7 +5668,7 @@ void md_do_sync(mddev_t *mddev)
} else {
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
- ITERATE_RDEV(mddev,rdev,rtmp)
+ rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
@@ -5496,9 +5680,22 @@ void md_do_sync(mddev_t *mddev)
skip:
mddev->curr_resync = 0;
+ mddev->resync_max = MaxSector;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
wake_up(&resync_wait);
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
+ return;
+
+ interrupted:
+ /*
+ * got a signal, exit.
+ */
+ printk(KERN_INFO
+ "md: md_do_sync() got signal ... exiting\n");
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto out;
+
}
EXPORT_SYMBOL_GPL(md_do_sync);
@@ -5509,8 +5706,9 @@ static int remove_and_add_spares(mddev_t *mddev)
struct list_head *rtmp;
int spares = 0;
- ITERATE_RDEV(mddev,rdev,rtmp)
+ rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk >= 0 &&
+ !mddev->external &&
(test_bit(Faulty, &rdev->flags) ||
! test_bit(In_sync, &rdev->flags)) &&
atomic_read(&rdev->nr_pending)==0) {
@@ -5524,7 +5722,7 @@ static int remove_and_add_spares(mddev_t *mddev)
}
if (mddev->degraded) {
- ITERATE_RDEV(mddev,rdev,rtmp)
+ rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
@@ -5589,7 +5787,7 @@ void md_check_recovery(mddev_t *mddev)
}
if ( ! (
- mddev->flags ||
+ (mddev->flags && !mddev->external) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
(mddev->safemode == 1) ||
@@ -5605,7 +5803,8 @@ void md_check_recovery(mddev_t *mddev)
if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
!mddev->in_sync && mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1;
- set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ if (mddev->persistent)
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
if (mddev->safemode == 1)
mddev->safemode = 0;
@@ -5637,7 +5836,7 @@ void md_check_recovery(mddev_t *mddev)
* information must be scrapped
*/
if (!mddev->degraded)
- ITERATE_RDEV(mddev,rdev,rtmp)
+ rdev_for_each(rdev, rtmp, mddev)
rdev->saved_raid_disk = -1;
mddev->recovery = 0;
@@ -5714,7 +5913,7 @@ static int md_notify_reboot(struct notifier_block *this,
printk(KERN_INFO "md: stopping all md devices.\n");
- ITERATE_MDDEV(mddev,tmp)
+ for_each_mddev(mddev, tmp)
if (mddev_trylock(mddev)) {
do_md_stop (mddev, 1);
mddev_unlock(mddev);
@@ -5848,7 +6047,7 @@ static __exit void md_exit(void)
unregister_reboot_notifier(&md_notifier);
unregister_sysctl_table(raid_table_header);
remove_proc_entry("mdstat", NULL);
- ITERATE_MDDEV(mddev,tmp) {
+ for_each_mddev(mddev, tmp) {
struct gendisk *disk = mddev->gendisk;
if (!disk)
continue;
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c
index adef299908cf..b61d5767aae7 100644
--- a/drivers/md/mktables.c
+++ b/drivers/md/mktables.c
@@ -1,13 +1,10 @@
-#ident "$Id: mktables.c,v 1.2 2002/12/12 22:41:27 hpa Exp $"
-/* ----------------------------------------------------------------------- *
+/* -*- linux-c -*- ------------------------------------------------------- *
*
- * Copyright 2002 H. Peter Anvin - All Rights Reserved
+ * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -26,100 +23,98 @@
static uint8_t gfmul(uint8_t a, uint8_t b)
{
- uint8_t v = 0;
-
- while ( b ) {
- if ( b & 1 ) v ^= a;
- a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
- b >>= 1;
- }
- return v;
+ uint8_t v = 0;
+
+ while (b) {
+ if (b & 1)
+ v ^= a;
+ a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
+ b >>= 1;
+ }
+
+ return v;
}
static uint8_t gfpow(uint8_t a, int b)
{
- uint8_t v = 1;
-
- b %= 255;
- if ( b < 0 )
- b += 255;
-
- while ( b ) {
- if ( b & 1 ) v = gfmul(v,a);
- a = gfmul(a,a);
- b >>= 1;
- }
- return v;
+ uint8_t v = 1;
+
+ b %= 255;
+ if (b < 0)
+ b += 255;
+
+ while (b) {
+ if (b & 1)
+ v = gfmul(v, a);
+ a = gfmul(a, a);
+ b >>= 1;
+ }
+
+ return v;
}
int main(int argc, char *argv[])
{
- int i, j, k;
- uint8_t v;
- uint8_t exptbl[256], invtbl[256];
-
- printf("#include \"raid6.h\"\n");
-
- /* Compute multiplication table */
- printf("\nconst u8 __attribute__((aligned(256)))\n"
- "raid6_gfmul[256][256] =\n"
- "{\n");
- for ( i = 0 ; i < 256 ; i++ ) {
- printf("\t{\n");
- for ( j = 0 ; j < 256 ; j += 8 ) {
- printf("\t\t");
- for ( k = 0 ; k < 8 ; k++ ) {
- printf("0x%02x, ", gfmul(i,j+k));
- }
- printf("\n");
- }
- printf("\t},\n");
- }
- printf("};\n");
-
- /* Compute power-of-2 table (exponent) */
- v = 1;
- printf("\nconst u8 __attribute__((aligned(256)))\n"
- "raid6_gfexp[256] =\n"
- "{\n");
- for ( i = 0 ; i < 256 ; i += 8 ) {
- printf("\t");
- for ( j = 0 ; j < 8 ; j++ ) {
- exptbl[i+j] = v;
- printf("0x%02x, ", v);
- v = gfmul(v,2);
- if ( v == 1 ) v = 0; /* For entry 255, not a real entry */
- }
- printf("\n");
- }
- printf("};\n");
-
- /* Compute inverse table x^-1 == x^254 */
- printf("\nconst u8 __attribute__((aligned(256)))\n"
- "raid6_gfinv[256] =\n"
- "{\n");
- for ( i = 0 ; i < 256 ; i += 8 ) {
- printf("\t");
- for ( j = 0 ; j < 8 ; j++ ) {
- invtbl[i+j] = v = gfpow(i+j,254);
- printf("0x%02x, ", v);
- }
- printf("\n");
- }
- printf("};\n");
-
- /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
- printf("\nconst u8 __attribute__((aligned(256)))\n"
- "raid6_gfexi[256] =\n"
- "{\n");
- for ( i = 0 ; i < 256 ; i += 8 ) {
- printf("\t");
- for ( j = 0 ; j < 8 ; j++ ) {
- printf("0x%02x, ", invtbl[exptbl[i+j]^1]);
- }
- printf("\n");
- }
- printf("};\n\n");
-
- return 0;
+ int i, j, k;
+ uint8_t v;
+ uint8_t exptbl[256], invtbl[256];
+
+ printf("#include \"raid6.h\"\n");
+
+ /* Compute multiplication table */
+ printf("\nconst u8 __attribute__((aligned(256)))\n"
+ "raid6_gfmul[256][256] =\n"
+ "{\n");
+ for (i = 0; i < 256; i++) {
+ printf("\t{\n");
+ for (j = 0; j < 256; j += 8) {
+ printf("\t\t");
+ for (k = 0; k < 8; k++)
+ printf("0x%02x,%c", gfmul(i, j + k),
+ (k == 7) ? '\n' : ' ');
+ }
+ printf("\t},\n");
+ }
+ printf("};\n");
+
+ /* Compute power-of-2 table (exponent) */
+ v = 1;
+ printf("\nconst u8 __attribute__((aligned(256)))\n"
+ "raid6_gfexp[256] =\n" "{\n");
+ for (i = 0; i < 256; i += 8) {
+ printf("\t");
+ for (j = 0; j < 8; j++) {
+ exptbl[i + j] = v;
+ printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+ v = gfmul(v, 2);
+ if (v == 1)
+ v = 0; /* For entry 255, not a real entry */
+ }
+ }
+ printf("};\n");
+
+ /* Compute inverse table x^-1 == x^254 */
+ printf("\nconst u8 __attribute__((aligned(256)))\n"
+ "raid6_gfinv[256] =\n" "{\n");
+ for (i = 0; i < 256; i += 8) {
+ printf("\t");
+ for (j = 0; j < 8; j++) {
+ invtbl[i + j] = v = gfpow(i + j, 254);
+ printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+ }
+ }
+ printf("};\n");
+
+ /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
+ printf("\nconst u8 __attribute__((aligned(256)))\n"
+ "raid6_gfexi[256] =\n" "{\n");
+ for (i = 0; i < 256; i += 8) {
+ printf("\t");
+ for (j = 0; j < 8; j++)
+ printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1],
+ (j == 7) ? '\n' : ' ');
+ }
+ printf("};\n");
+
+ return 0;
}
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index eb631ebed686..3f299d835a2b 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -436,7 +436,7 @@ static int multipath_run (mddev_t *mddev)
}
conf->working_disks = 0;
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx < 0 ||
disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f8e591708d1f..818b48284096 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -72,11 +72,11 @@ static int create_strip_zones (mddev_t *mddev)
*/
conf->nr_strip_zones = 0;
- ITERATE_RDEV(mddev,rdev1,tmp1) {
+ rdev_for_each(rdev1, tmp1, mddev) {
printk("raid0: looking at %s\n",
bdevname(rdev1->bdev,b));
c = 0;
- ITERATE_RDEV(mddev,rdev2,tmp2) {
+ rdev_for_each(rdev2, tmp2, mddev) {
printk("raid0: comparing %s(%llu)",
bdevname(rdev1->bdev,b),
(unsigned long long)rdev1->size);
@@ -124,7 +124,7 @@ static int create_strip_zones (mddev_t *mddev)
cnt = 0;
smallest = NULL;
zone->dev = conf->devlist;
- ITERATE_RDEV(mddev, rdev1, tmp1) {
+ rdev_for_each(rdev1, tmp1, mddev) {
int j = rdev1->raid_disk;
if (j < 0 || j >= mddev->raid_disks) {
@@ -293,7 +293,7 @@ static int raid0_run (mddev_t *mddev)
/* calculate array device size */
mddev->array_size = 0;
- ITERATE_RDEV(mddev,rdev,tmp)
+ rdev_for_each(rdev, tmp, mddev)
mddev->array_size += rdev->size;
printk("raid0 : md_size is %llu blocks.\n",
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4a69c416e045..5c7fef091cec 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1684,6 +1684,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (!go_faster && conf->nr_waiting)
msleep_interruptible(1000);
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr);
raise_barrier(conf);
conf->next_resync = sector_nr;
@@ -1766,6 +1767,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return rv;
}
+ if (max_sector > mddev->resync_max)
+ max_sector = mddev->resync_max; /* Don't do IO beyond here */
nr_sectors = 0;
sync_blocks = 0;
do {
@@ -1884,7 +1887,7 @@ static int run(mddev_t *mddev)
if (!conf->r1bio_pool)
goto out_no_mem;
- ITERATE_RDEV(mddev, rdev, tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5cdcc9386200..017f58113c33 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1657,6 +1657,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return (max_sector - sector_nr) + sectors_skipped;
}
+ if (max_sector > mddev->resync_max)
+ max_sector = mddev->resync_max; /* Don't do IO beyond here */
+
/* make sure whole request will fit in a chunk - if chunks
* are meaningful
*/
@@ -1670,6 +1673,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (!go_faster && conf->nr_waiting)
msleep_interruptible(1000);
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+
/* Again, very different code for resync and recovery.
* Both must result in an r10bio with a list of bios that
* have bi_end_io, bi_sector, bi_bdev set,
@@ -2021,7 +2026,7 @@ static int run(mddev_t *mddev)
goto out_free_conf;
}
- ITERATE_RDEV(mddev, rdev, tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e8c8157b02fc..2d6f1a51359c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3159,7 +3159,8 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
atomic_inc(&conf->preread_active_stripes);
list_add_tail(&sh->lru, &conf->handle_list);
}
- }
+ } else
+ blk_plug_device(conf->mddev->queue);
}
static void activate_bit_delay(raid5_conf_t *conf)
@@ -3549,7 +3550,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
goto retry;
}
finish_wait(&conf->wait_for_overlap, &w);
- handle_stripe(sh, NULL);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
release_stripe(sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
@@ -3698,6 +3700,25 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
release_stripe(sh);
first_sector += STRIPE_SECTORS;
}
+ /* If this takes us to the resync_max point where we have to pause,
+ * then we need to write out the superblock.
+ */
+ sector_nr += conf->chunk_size>>9;
+ if (sector_nr >= mddev->resync_max) {
+ /* Cannot proceed until we've updated the superblock... */
+ wait_event(conf->wait_for_overlap,
+ atomic_read(&conf->reshape_stripes) == 0);
+ mddev->reshape_position = conf->expand_progress;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+ || kthread_should_stop());
+ spin_lock_irq(&conf->device_lock);
+ conf->expand_lo = mddev->reshape_position;
+ spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_for_overlap);
+ }
return conf->chunk_size>>9;
}
@@ -3734,6 +3755,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
+ /* No need to check resync_max as we never do more than one
+ * stripe, and as resync_max will always be on a chunk boundary,
+ * if the check in md_do_sync didn't fire, there is no chance
+ * of overstepping resync_max here
+ */
+
/* if there is too many failed drives and we are trying
* to resync, then assert that we are finished, because there is
* nothing we can do.
@@ -3753,6 +3780,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}
+
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+
pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
if (sh == NULL) {
@@ -3864,7 +3894,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
* During the scan, completed stripes are saved for us by the interrupt
* handler, so that they will not have to wait for our next wakeup.
*/
-static void raid5d (mddev_t *mddev)
+static void raid5d(mddev_t *mddev)
{
struct stripe_head *sh;
raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -3889,12 +3919,6 @@ static void raid5d (mddev_t *mddev)
activate_bit_delay(conf);
}
- if (list_empty(&conf->handle_list) &&
- atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
- !blk_queue_plugged(mddev->queue) &&
- !list_empty(&conf->delayed_list))
- raid5_activate_delayed(conf);
-
while ((bio = remove_bio_from_retry(conf))) {
int ok;
spin_unlock_irq(&conf->device_lock);
@@ -4108,7 +4132,7 @@ static int run(mddev_t *mddev)
pr_debug("raid5: run(%s) called.\n", mdname(mddev));
- ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev_for_each(rdev, tmp, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= conf->raid_disks
|| raid_disk < 0)
@@ -4521,7 +4545,7 @@ static int raid5_start_reshape(mddev_t *mddev)
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
- ITERATE_RDEV(mddev, rdev, rtmp)
+ rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags))
spares++;
@@ -4543,7 +4567,7 @@ static int raid5_start_reshape(mddev_t *mddev)
/* Add some new drives, as many as will fit.
* We know there are enough to make the newly sized array work.
*/
- ITERATE_RDEV(mddev, rdev, rtmp)
+ rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) {
if (raid5_add_disk(mddev, rdev)) {
diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c
index 0d5cd57accd7..559cc41b2585 100644
--- a/drivers/md/raid6test/test.c
+++ b/drivers/md/raid6test/test.c
@@ -1,12 +1,10 @@
/* -*- linux-c -*- ------------------------------------------------------- *
*
- * Copyright 2002 H. Peter Anvin - All Rights Reserved
+ * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
- * (at your option) any later version; incorporated herein by reference.
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
@@ -30,67 +28,87 @@ char *dataptrs[NDISKS];
char data[NDISKS][PAGE_SIZE];
char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
-void makedata(void)
+static void makedata(void)
{
int i, j;
- for ( i = 0 ; i < NDISKS ; i++ ) {
- for ( j = 0 ; j < PAGE_SIZE ; j++ ) {
+ for (i = 0; i < NDISKS; i++) {
+ for (j = 0; j < PAGE_SIZE; j++)
data[i][j] = rand();
- }
+
dataptrs[i] = data[i];
}
}
+static char disk_type(int d)
+{
+ switch (d) {
+ case NDISKS-2:
+ return 'P';
+ case NDISKS-1:
+ return 'Q';
+ default:
+ return 'D';
+ }
+}
+
+static int test_disks(int i, int j)
+{
+ int erra, errb;
+
+ memset(recovi, 0xf0, PAGE_SIZE);
+ memset(recovj, 0xba, PAGE_SIZE);
+
+ dataptrs[i] = recovi;
+ dataptrs[j] = recovj;
+
+ raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs);
+
+ erra = memcmp(data[i], recovi, PAGE_SIZE);
+ errb = memcmp(data[j], recovj, PAGE_SIZE);
+
+ if (i < NDISKS-2 && j == NDISKS-1) {
+ /* We don't implement the DQ failure scenario, since it's
+ equivalent to a RAID-5 failure (XOR, then recompute Q) */
+ erra = errb = 0;
+ } else {
+ printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n",
+ raid6_call.name,
+ i, disk_type(i),
+ j, disk_type(j),
+ (!erra && !errb) ? "OK" :
+ !erra ? "ERRB" :
+ !errb ? "ERRA" : "ERRAB");
+ }
+
+ dataptrs[i] = data[i];
+ dataptrs[j] = data[j];
+
+ return erra || errb;
+}
+
int main(int argc, char *argv[])
{
- const struct raid6_calls * const * algo;
+ const struct raid6_calls *const *algo;
int i, j;
- int erra, errb;
+ int err = 0;
makedata();
- for ( algo = raid6_algos ; *algo ; algo++ ) {
- if ( !(*algo)->valid || (*algo)->valid() ) {
+ for (algo = raid6_algos; *algo; algo++) {
+ if (!(*algo)->valid || (*algo)->valid()) {
raid6_call = **algo;
/* Nuke syndromes */
memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
/* Generate assumed good syndrome */
- raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, (void **)&dataptrs);
-
- for ( i = 0 ; i < NDISKS-1 ; i++ ) {
- for ( j = i+1 ; j < NDISKS ; j++ ) {
- memset(recovi, 0xf0, PAGE_SIZE);
- memset(recovj, 0xba, PAGE_SIZE);
-
- dataptrs[i] = recovi;
- dataptrs[j] = recovj;
-
- raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs);
-
- erra = memcmp(data[i], recovi, PAGE_SIZE);
- errb = memcmp(data[j], recovj, PAGE_SIZE);
-
- if ( i < NDISKS-2 && j == NDISKS-1 ) {
- /* We don't implement the DQ failure scenario, since it's
- equivalent to a RAID-5 failure (XOR, then recompute Q) */
- } else {
- printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n",
- raid6_call.name,
- i, (i==NDISKS-2)?'P':'D',
- j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D',
- (!erra && !errb) ? "OK" :
- !erra ? "ERRB" :
- !errb ? "ERRA" :
- "ERRAB");
- }
-
- dataptrs[i] = data[i];
- dataptrs[j] = data[j];
- }
- }
+ raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
+ (void **)&dataptrs);
+
+ for (i = 0; i < NDISKS-1; i++)
+ for (j = i+1; j < NDISKS; j++)
+ err += test_disks(i, j);
}
printf("\n");
}
@@ -99,5 +117,8 @@ int main(int argc, char *argv[])
/* Pick the best algorithm test */
raid6_select_algo();
- return 0;
+ if (err)
+ printf("\n*** ERRORS FOUND ***\n");
+
+ return err;
}