diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-07 17:07:20 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-07 17:07:20 -0700 |
commit | 9d2cd01b15d0782adb81e40094b67904d77b03df (patch) | |
tree | f8091fcd05f463a0b31485cfe3edcef0d0211da9 /fs/exofs | |
parent | 57d326169e878a1a37b2bccd1cf81f6809ee67b9 (diff) | |
parent | ce5d36aac26cc395fe3bc45525cdbad3644f01e5 (diff) |
Merge branch 'for-linus' of git://git.open-osd.org/linux-open-osd into next
Pull exofs raid6 support from Boaz Harrosh:
"These simple patches will enable raid6 using the kernel's raid6_pq
engine for support under exofs and pnfs-objects.
Nothing needs to be done in exofs or pnfs-obj. Just fire your
mkfs.exofs with --raid=6 (that was already supported before) and off
you go as usual. The ORE will pick up the new map and will start
writing two devices of redundancy bits. The patches are so simple
because most of the ORE was already for the general raid case, only a
few bug fixes were needed and the actual wiring into the raid6_pq
engine"
* 'for-linus' of git://git.open-osd.org/linux-open-osd:
ore: Support for raid 6
ore: Remove redundant dev_order(), more cleanups
ore: (trivial) reformat some code
Diffstat (limited to 'fs/exofs')
-rw-r--r-- | fs/exofs/Kconfig.ore | 2 | ||||
-rw-r--r-- | fs/exofs/ore.c | 100 | ||||
-rw-r--r-- | fs/exofs/ore_raid.c | 56 | ||||
-rw-r--r-- | fs/exofs/ore_raid.h | 21 |
4 files changed, 98 insertions, 81 deletions
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore index 1ca7fb7b6ba8..2daf2329c28d 100644 --- a/fs/exofs/Kconfig.ore +++ b/fs/exofs/Kconfig.ore @@ -9,4 +9,6 @@ config ORE tristate depends on EXOFS_FS || PNFS_OBJLAYOUT select ASYNC_XOR + select RAID6_PQ + select ASYNC_PQ default SCSI_OSD_ULD diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index dae884694bd9..cfc0205d62c4 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->parity = 1; break; case PNFS_OSD_RAID_PQ: + layout->parity = 2; + break; case PNFS_OSD_RAID_4: default: - ORE_ERR("Only RAID_0/5 for now\n"); + ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", + layout->raid_algorithm); return -EINVAL; } if (0 != (layout->stripe_unit & ~PAGE_MASK)) { @@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length /= stripe_length; layout->max_io_length *= stripe_length; } + ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); + return 0; } EXPORT_SYMBOL(ore_verify_layout); @@ -545,21 +550,24 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, /* "H - (N * U)" is just "H % U" so it's bound to u32 */ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; + u32 first_dev = C - C % group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); + si->cur_comp = C - first_dev; + si->cur_pg = si->unit_off / PAGE_SIZE; if (parity) { u32 LCMdP = lcm(group_width, parity) / parity; /* R = N % LCMdP; */ u32 RxP = (N % LCMdP) * parity; - u32 first_dev = C - C % group_width; si->par_dev = (group_width + group_width - parity - RxP) % group_width + first_dev; - si->dev = (group_width + C - RxP) % group_width + first_dev; + si->dev = (group_width + group_width + C - RxP) % + group_width + first_dev; si->bytes_in_stripe = U; si->first_stripe_start = M * S + G * T 
+ N * U; } else { @@ -649,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance return ret; } +static int _add_parity_units(struct ore_io_state *ios, + struct ore_striping_info *si, + unsigned dev, unsigned first_dev, + unsigned mirrors_p1, unsigned devs_in_group, + unsigned cur_len) +{ + unsigned do_parity; + int ret = 0; + + for (do_parity = ios->layout->parity; do_parity; --do_parity) { + struct ore_per_dev_state *per_dev; + + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length && !per_dev->offset) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, + do_parity == 1); + if (unlikely(ret)) + break; + + if (do_parity != 1) { + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; + si->cur_comp = (si->cur_comp + 1) % + ios->layout->group_width; + } + } + + return ret; +} + static int _prepare_for_striping(struct ore_io_state *ios) { struct ore_striping_info *si = &ios->si; @@ -658,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned dev_order; unsigned cur_pg = ios->pages_consumed; u64 length = ios->length; int ret = 0; @@ -670,16 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) BUG_ON(length > si->length); - dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); - si->cur_comp = dev_order; - si->cur_pg = si->unit_off / PAGE_SIZE; - while (length) { - unsigned comp = dev - first_dev; - struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; + struct ore_per_dev_state *per_dev = + &ios->per_dev[dev - first_dev]; unsigned cur_len, page_off = 0; - if (!per_dev->length) { + if (!per_dev->length && !per_dev->offset) { + /* First 
time initialize the per_dev info. */ per_dev->dev = dev; if (dev == si->dev) { WARN_ON(dev == si->par_dev); @@ -688,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); } else { - if (si->cur_comp > dev_order) - per_dev->offset = - si->obj_offset - si->unit_off; - else /* si->cur_comp < dev_order */ - per_dev->offset = - si->obj_offset + stripe_unit - - si->unit_off; + per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; } } else { @@ -708,11 +743,9 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (unlikely(ret)) goto out; - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - length -= cur_len; + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; si->cur_comp = (si->cur_comp + 1) % group_width; if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { if (!length && ios->sp2d) { @@ -720,23 +753,16 @@ static int _prepare_for_striping(struct ore_io_state *ios) * stripe. then operate on parity dev. */ dev = si->par_dev; - } - if (ios->sp2d) - /* In writes cur_len just means if it's the - * last one. See _ore_add_parity_unit. - */ - cur_len = length; - per_dev = &ios->per_dev[dev - first_dev]; - if (!per_dev->length) { - /* Only/always the parity unit of the first - * stripe will be empty. So this is a chance to - * initialize the per_dev info. - */ - per_dev->dev = dev; - per_dev->offset = si->obj_offset - si->unit_off; + /* If last stripe operate on parity comp */ + si->cur_comp = group_width - ios->layout->parity; } - ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + ret = _add_parity_units(ios, si, dev, first_dev, + mirrors_p1, devs_in_group, + ios->sp2d ? 
length : cur_len); if (unlikely(ret)) goto out; @@ -747,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) /* Next stripe, start fresh */ si->cur_comp = 0; si->cur_pg = 0; + si->obj_offset += cur_len; + si->unit_off = 0; } } out: diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 4e2c032ab8a1..7f20f25c232c 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -218,22 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) { unsigned p; + unsigned tx_flags = ASYNC_TX_ACK; + + if (sp2d->parity == 1) + tx_flags |= ASYNC_TX_XOR_ZERO_DST; + for (p = 0; p < sp2d->pages_in_unit; p++) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; if (!_1ps->write_count) continue; - init_async_submit(&_1ps->submit, - ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, - NULL, - NULL, NULL, - (addr_conv_t *)_1ps->scribble); - - /* TODO: raid6 */ - _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, - 0, sp2d->data_devs, PAGE_SIZE, - &_1ps->submit); + init_async_submit(&_1ps->submit, tx_flags, + NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); + + if (sp2d->parity == 1) + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], + _1ps->pages, 0, sp2d->data_devs, + PAGE_SIZE, &_1ps->submit); + else /* parity == 2 */ + _1ps->tx = async_gen_syndrome(_1ps->pages, 0, + sp2d->data_devs + sp2d->parity, + PAGE_SIZE, &_1ps->submit); } for (p = 0; p < sp2d->pages_in_unit; p++) { @@ -404,9 +410,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) ore_calc_stripe_info(ios->layout, *offset, 0, &si); - p = si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, si.par_dev, si.dev); + p = si.cur_pg; + c = si.cur_comp; page = ios->sp2d->_1p_stripes[p].pages[c]; pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); @@ -534,9 +539,8 @@ static int _read_4_write_last_stripe(struct ore_io_state *ios) goto read_it; 
ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - p = read_si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + p = read_si.cur_pg; + c = read_si.cur_comp; if (min_p == sp2d->pages_in_unit) { /* Didn't do it yet */ @@ -620,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios) int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, struct ore_per_dev_state *per_dev, - unsigned cur_len) + unsigned cur_len, bool do_xor) { if (ios->reading) { if (per_dev->cur_sg >= ios->sgs_per_dev) { @@ -640,17 +644,16 @@ int _ore_add_parity_unit(struct ore_io_state *ios, si->cur_pg = _sp2d_min_pg(sp2d); num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; - if (!cur_len) /* If last stripe operate on parity comp */ - si->cur_comp = sp2d->data_devs; - if (!per_dev->length) { per_dev->offset += si->cur_pg * PAGE_SIZE; /* If first stripe, Read in all read4write pages * (if needed) before we calculate the first parity. 
*/ - _read_4_write_first_stripe(ios); + if (do_xor) + _read_4_write_first_stripe(ios); } - if (!cur_len) /* If last stripe r4w pages of last stripe */ + if (!cur_len && do_xor) + /* If last stripe r4w pages of last stripe */ _read_4_write_last_stripe(ios); _read_4_write_execute(ios); @@ -662,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios, ++(ios->cur_par_page); } - BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_comp < sp2d->data_devs); BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, @@ -670,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios, if (unlikely(ret)) return ret; - /* TODO: raid6 if (last_parity_dev) */ - _gen_xor_unit(sp2d); - _sp2d_reset(sp2d, ios->r4w, ios->private); + if (do_xor) { + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } } return 0; } diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index 2ffd2c3c6e46..cf6375d82129 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -31,24 +31,6 @@ #define ORE_DBGMSG2(M...) do {} while (0) /* #define ORE_DBGMSG2 ORE_DBGMSG */ -/* Calculate the component order in a stripe. eg the logical data unit - * address within the stripe of @dev given the @par_dev of this stripe. 
- */ -static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, - unsigned par_dev, unsigned dev) -{ - unsigned first_dev = dev - dev % devs_in_group; - - dev -= first_dev; - par_dev -= first_dev; - - if (devs_in_group == par_dev) /* The raid 0 case */ - return dev / mirrors_p1; - /* raid4/5/6 case */ - return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / - mirrors_p1; -} - /* ios_raid.c stuff needed by ios.c */ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); void _ore_free_raid_stuff(struct ore_io_state *ios); @@ -56,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios); void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last); int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, - struct ore_per_dev_state *per_dev, unsigned cur_len); + struct ore_per_dev_state *per_dev, unsigned cur_len, + bool do_xor); void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, struct ore_striping_info *si, struct page *page); static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, |