| author | Dan Williams <dan.j.williams@intel.com> | 2009-07-14 13:40:19 -0700 | 
|---|---|---|
| committer | Dan Williams <dan.j.williams@intel.com> | 2009-08-29 19:13:12 -0700 | 
| commit | ac6b53b6e6acab27e4f3e2383f9ac1f0d7c6200b (patch) | |
| tree | e9c67502fd32a9eea4902ea382e51eecb09f2a54 /drivers/md/raid5.c | |
| parent | 4e7d2c0aefb77f7b24942e5af042a083be4d60bb (diff) | |
md/raid6: asynchronous raid6 operations
[ Based on an original patch by Yuri Tikhonov ]
The raid_run_ops routine uses the asynchronous offload API and the
stripe_operations member of a stripe_head to carry out xor+pq+copy
operations asynchronously, outside the lock.
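For context, the "pq" in that pipeline is RAID-6 parity: P is the plain XOR of the data blocks, and Q is a weighted XOR over GF(2^8). Below is a minimal, self-contained sketch of that computation (the helpers gf_mul2 and gen_pq are hypothetical names written only for illustration; the kernel drives its own optimized syndrome routines through async_gen_syndrome(), as the diff shows):

```c
#include <stddef.h>
#include <stdint.h>

/* Multiply by the generator g = 2 in GF(2^8) with the 0x11d polynomial,
 * the field used by the Linux RAID-6 code. */
static uint8_t gf_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* Compute P (plain XOR) and Q (Q = sum of g^i * D_i) over one stripe of
 * 'ndisks' data blocks, each 'len' bytes long. */
static void gen_pq(size_t ndisks, size_t len,
		   const uint8_t **data, uint8_t *p, uint8_t *q)
{
	for (size_t off = 0; off < len; off++) {
		uint8_t pv = 0, qv = 0;

		/* Horner's rule: walk the disks from the highest index down,
		 * multiplying the running Q by g once per step. */
		for (size_t d = ndisks; d-- > 0; ) {
			qv = gf_mul2(qv) ^ data[d][off];
			pv ^= data[d][off];
		}
		p[off] = pv;
		q[off] = qv;
	}
}
```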
The operations performed by RAID-6 are the same as in the RAID-5 case
except that STRIPE_OP_PREXOR operations are not supported. All the others
are supported:
STRIPE_OP_BIOFILL
 - copy data into request buffers to satisfy a read request
STRIPE_OP_COMPUTE_BLK
 - generate missing blocks (1 or 2) in the cache from the other blocks (the two-failure recovery choice is sketched after this list)
STRIPE_OP_BIODRAIN
 - copy data out of request buffers to satisfy a write request
STRIPE_OP_RECONSTRUCT
 - recalculate parity for new data that has entered the cache
STRIPE_OP_CHECK
 - verify that the parity is correct
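The two-missing-blocks case of STRIPE_OP_COMPUTE_BLK (ops_run_compute6_2 in the diff below) picks one of four recovery strategies depending on which syndrome slots failed. Here is a condensed, standalone sketch of that decision tree; classify_failure and the enum names are hypothetical, introduced only for illustration, and the real routine then issues async_gen_syndrome, async_raid6_datap_recov or async_raid6_2data_recov as appropriate:

```c
#include <stdio.h>

enum r6_recovery { R6_RECOMPUTE_PQ, R6_XOR_D_THEN_Q, R6_DATAP_RECOV, R6_2DATA_RECOV };

/* Classify a double failure the way ops_run_compute6_2() does: 'faila' and
 * 'failb' are syndrome slot numbers with faila < failb; in the patch's slot
 * numbering, data blocks occupy slots 0..syndrome_disks-1, P sits at slot
 * syndrome_disks and Q at slot syndrome_disks+1. */
static enum r6_recovery classify_failure(int faila, int failb, int syndrome_disks)
{
	if (failb == syndrome_disks + 1) {        /* Q is one of the failures */
		if (faila == syndrome_disks)
			return R6_RECOMPUTE_PQ;   /* P+Q: regenerate both from data */
		return R6_XOR_D_THEN_Q;           /* D+Q: rebuild D via XOR, then Q */
	}
	if (failb == syndrome_disks)
		return R6_DATAP_RECOV;            /* D+P: datap recovery path */
	return R6_2DATA_RECOV;                    /* D+D: two-data recovery path */
}

int main(void)
{
	/* e.g. a 6-device array: 4 data slots, P at slot 4, Q at slot 5 */
	printf("%d\n", classify_failure(1, 3, 4)); /* two data blocks -> R6_2DATA_RECOV */
	return 0;
}
```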
The flow is the same as in the RAID-5 case, and reuses some routines, namely:
1/ ops_complete_postxor (renamed to ops_complete_reconstruct)
2/ ops_complete_compute (updated to set up to 2 targets uptodate)
3/ ops_run_check (renamed to ops_run_check_p for xor parity checks)
[neilb@suse.de: fixes to get it to pass mdadm regression suite]
Reviewed-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Yuri Tikhonov <yur@emcraft.com>
Signed-off-by: Ilya Yanok <yanok@emcraft.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Diffstat (limited to 'drivers/md/raid5.c')
| -rw-r--r-- | drivers/md/raid5.c | 322 | 
1 file changed, 292 insertions(+), 30 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e3a2990bdc7c..e68616ed3e78 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -636,15 +636,16 @@ static void mark_target_uptodate(struct stripe_head *sh, int target)
 	clear_bit(R5_Wantcompute, &tgt->flags);
 }
 
-static void ops_complete_compute5(void *stripe_head_ref)
+static void ops_complete_compute(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	/* mark the computed target as uptodate */
+	/* mark the computed target(s) as uptodate */
 	mark_target_uptodate(sh, sh->ops.target);
+	mark_target_uptodate(sh, sh->ops.target2);
 
 	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
 	if (sh->check_state == check_state_compute_run)
@@ -684,7 +685,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 	atomic_inc(&sh->count);
 
 	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
-			  ops_complete_compute5, sh, to_addr_conv(sh, percpu));
+			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
 	if (unlikely(count == 1))
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 	else
@@ -693,6 +694,197 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 	return tx;
 }
 
+/* set_syndrome_sources - populate source buffers for gen_syndrome
+ * @srcs - (struct page *) array of size sh->disks
+ * @sh - stripe_head to parse
+ *
+ * Populates srcs in proper layout order for the stripe and returns the
+ * 'count' of sources to be used in a call to async_gen_syndrome.  The P
+ * destination buffer is recorded in srcs[count] and the Q destination
+ * is recorded in srcs[count+1]].
+ */
+static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+{
+	int disks = sh->disks;
+	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
+	int d0_idx = raid6_d0(sh);
+	int count;
+	int i;
+
+	for (i = 0; i < disks; i++)
+		srcs[i] = (void *)raid6_empty_zero_page;
+
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+		srcs[slot] = sh->dev[i].page;
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+	BUG_ON(count != syndrome_disks);
+
+	return count;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+	int disks = sh->disks;
+	struct page **blocks = percpu->scribble;
+	int target;
+	int qd_idx = sh->qd_idx;
+	struct dma_async_tx_descriptor *tx;
+	struct async_submit_ctl submit;
+	struct r5dev *tgt;
+	struct page *dest;
+	int i;
+	int count;
+
+	if (sh->ops.target < 0)
+		target = sh->ops.target2;
+	else if (sh->ops.target2 < 0)
+		target = sh->ops.target;
+	else
+		/* we should only have one valid target */
+		BUG();
+	BUG_ON(target < 0);
+	pr_debug("%s: stripe %llu block: %d\n",
+		__func__, (unsigned long long)sh->sector, target);
+
+	tgt = &sh->dev[target];
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+	dest = tgt->page;
+
+	atomic_inc(&sh->count);
+
+	if (target == qd_idx) {
+		count = set_syndrome_sources(blocks, sh);
+		blocks[count] = NULL; /* regenerating p is not necessary */
+		BUG_ON(blocks[count+1] != dest); /* q should already be set */
+		init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
+				  to_addr_conv(sh, percpu));
+		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+	} else {
+		/* Compute any data- or p-drive using XOR */
+		count = 0;
+		for (i = disks; i-- ; ) {
+			if (i == target || i == qd_idx)
+				continue;
+			blocks[count++] = sh->dev[i].page;
+		}
+
+		init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
+				  ops_complete_compute, sh,
+				  to_addr_conv(sh, percpu));
+		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
+	}
+
+	return tx;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+	int i, count, disks = sh->disks;
+	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
+	int d0_idx = raid6_d0(sh);
+	int faila = -1, failb = -1;
+	int target = sh->ops.target;
+	int target2 = sh->ops.target2;
+	struct r5dev *tgt = &sh->dev[target];
+	struct r5dev *tgt2 = &sh->dev[target2];
+	struct dma_async_tx_descriptor *tx;
+	struct page **blocks = percpu->scribble;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
+		 __func__, (unsigned long long)sh->sector, target, target2);
+	BUG_ON(target < 0 || target2 < 0);
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
+
+	/* we need to open-code set_syndrome_sources to handle to the
+	 * slot number conversion for 'faila' and 'failb'
+	 */
+	for (i = 0; i < disks ; i++)
+		blocks[i] = (void *)raid6_empty_zero_page;
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+		blocks[slot] = sh->dev[i].page;
+
+		if (i == target)
+			faila = slot;
+		if (i == target2)
+			failb = slot;
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+	BUG_ON(count != syndrome_disks);
+
+	BUG_ON(faila == failb);
+	if (failb < faila)
+		swap(faila, failb);
+	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
+		 __func__, (unsigned long long)sh->sector, faila, failb);
+
+	atomic_inc(&sh->count);
+
+	if (failb == syndrome_disks+1) {
+		/* Q disk is one of the missing disks */
+		if (faila == syndrome_disks) {
+			/* Missing P+Q, just recompute */
+			init_async_submit(&submit, 0, NULL, ops_complete_compute,
+					  sh, to_addr_conv(sh, percpu));
+			return async_gen_syndrome(blocks, 0, count+2,
+						  STRIPE_SIZE, &submit);
+		} else {
+			struct page *dest;
+			int data_target;
+			int qd_idx = sh->qd_idx;
+
+			/* Missing D+Q: recompute D from P, then recompute Q */
+			if (target == qd_idx)
+				data_target = target2;
+			else
+				data_target = target;
+
+			count = 0;
+			for (i = disks; i-- ; ) {
+				if (i == data_target || i == qd_idx)
+					continue;
+				blocks[count++] = sh->dev[i].page;
+			}
+			dest = sh->dev[data_target].page;
+			init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
+					  NULL, NULL, to_addr_conv(sh, percpu));
+			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
+				       &submit);
+
+			count = set_syndrome_sources(blocks, sh);
+			init_async_submit(&submit, 0, tx, ops_complete_compute,
+					  sh, to_addr_conv(sh, percpu));
+			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
+		}
+	}
+
+	init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
+			  to_addr_conv(sh, percpu));
+	if (failb == syndrome_disks) {
+		/* We're missing D+P. */
+		return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
+					       faila, blocks, &submit);
+	} else {
+		/* We're missing D+D. */
+		return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE,
+					       faila, failb, blocks, &submit);
+	}
+}
+
+
 static void ops_complete_prexor(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
@@ -765,17 +957,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	return tx;
 }
 
-static void ops_complete_postxor(void *stripe_head_ref)
+static void ops_complete_reconstruct(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	int disks = sh->disks, i, pd_idx = sh->pd_idx;
+	int disks = sh->disks;
+	int pd_idx = sh->pd_idx;
+	int qd_idx = sh->qd_idx;
+	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
-		if (dev->written || i == pd_idx)
+
+		if (dev->written || i == pd_idx || i == qd_idx)
 			set_bit(R5_UPTODATE, &dev->flags);
 	}
 
@@ -793,8 +989,8 @@ static void ops_complete_postxor(void *stripe_head_ref)
 }
 
 static void
-ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu,
-		struct dma_async_tx_descriptor *tx)
+ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
+		     struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
 	struct page **xor_srcs = percpu->scribble;
@@ -837,7 +1033,7 @@ ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu,
 
 	atomic_inc(&sh->count);
 
-	init_async_submit(&submit, flags, tx, ops_complete_postxor, sh,
+	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
 			  to_addr_conv(sh, percpu));
 	if (unlikely(count == 1))
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
@@ -845,6 +1041,25 @@ ops_run_postxor(struct stripe_head *sh, struct raid5_percpu *percpu,
 		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 }
 
+static void
+ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
+		     struct dma_async_tx_descriptor *tx)
+{
+	struct async_submit_ctl submit;
+	struct page **blocks = percpu->scribble;
+	int count;
+
+	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+	count = set_syndrome_sources(blocks, sh);
+
+	atomic_inc(&sh->count);
+
+	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+			  sh, to_addr_conv(sh, percpu));
+	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
+}
+
 static void ops_complete_check(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
@@ -857,23 +1072,28 @@ static void ops_complete_check(void *stripe_head_ref)
 	release_stripe(sh);
 }
 
-static void ops_run_check(struct stripe_head *sh, struct raid5_percpu *percpu)
+static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
 	int disks = sh->disks;
+	int pd_idx = sh->pd_idx;
+	int qd_idx = sh->qd_idx;
+	struct page *xor_dest;
 	struct page **xor_srcs = percpu->scribble;
 	struct dma_async_tx_descriptor *tx;
 	struct async_submit_ctl submit;
-
-	int count = 0, pd_idx = sh->pd_idx, i;
-	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+	int count;
+	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	count = 0;
+	xor_dest = sh->dev[pd_idx].page;
+	xor_srcs[count++] = xor_dest;
 	for (i = disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
-		if (i != pd_idx)
-			xor_srcs[count++] = dev->page;
+		if (i == pd_idx || i == qd_idx)
+			continue;
+		xor_srcs[count++] = sh->dev[i].page;
 	}
 
 	init_async_submit(&submit, 0, NULL, NULL, NULL,
@@ -886,11 +1106,32 @@ static void ops_run_check(struct stripe_head *sh, struct raid5_percpu *percpu)
 	tx = async_trigger_callback(&submit);
 }
 
-static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
+{
+	struct page **srcs = percpu->scribble;
+	struct async_submit_ctl submit;
+	int count;
+
+	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
+		(unsigned long long)sh->sector, checkp);
+
+	count = set_syndrome_sources(srcs, sh);
+	if (!checkp)
+		srcs[count] = NULL;
+
+	atomic_inc(&sh->count);
+	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
+			  sh, to_addr_conv(sh, percpu));
+	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
+			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
+}
+
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
 	raid5_conf_t *conf = sh->raid_conf;
+	int level = conf->level;
 	struct raid5_percpu *percpu;
 	unsigned long cpu;
 
@@ -902,9 +1143,16 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	}
 
 	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
-		tx = ops_run_compute5(sh, percpu);
-		/* terminate the chain if postxor is not set to be run */
-		if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
+		if (level < 6)
+			tx = ops_run_compute5(sh, percpu);
+		else {
+			if (sh->ops.target2 < 0 || sh->ops.target < 0)
+				tx = ops_run_compute6_1(sh, percpu);
+			else
+				tx = ops_run_compute6_2(sh, percpu);
+		}
+		/* terminate the chain if reconstruct is not set to be run */
+		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
 			async_tx_ack(tx);
 	}
 
@@ -916,11 +1164,23 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
-		ops_run_postxor(sh, percpu, tx);
+	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
+		if (level < 6)
+			ops_run_reconstruct5(sh, percpu, tx);
+		else
+			ops_run_reconstruct6(sh, percpu, tx);
+	}
 
-	if (test_bit(STRIPE_OP_CHECK, &ops_request))
-		ops_run_check(sh, percpu);
+	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
+		if (sh->check_state == check_state_run)
+			ops_run_check_p(sh, percpu);
+		else if (sh->check_state == check_state_run_q)
+			ops_run_check_pq(sh, percpu, 0);
+		else if (sh->check_state == check_state_run_pq)
+			ops_run_check_pq(sh, percpu, 1);
+		else
+			BUG();
+	}
 
 	if (overlap_clear)
 		for (i = disks; i--; ) {
@@ -1931,7 +2191,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
 		} else
 			sh->reconstruct_state = reconstruct_state_run;
 
-		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
+		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -1954,7 +2214,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
 		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
 		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
 		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
-		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
+		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -2206,9 +2466,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
 			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
 			set_bit(R5_Wantcompute, &dev->flags);
 			sh->ops.target = disk_idx;
+			sh->ops.target2 = -1;
 			s->req_compute = 1;
 			/* Careful: from this point on 'uptodate' is in the eye
-			 * of raid5_run_ops which services 'compute' operations
+			 * of raid_run_ops which services 'compute' operations
 			 * before writes. R5_Wantcompute flags a block that will
 			 * be R5_UPTODATE by the time it is needed for a
 			 * subsequent operation.
@@ -2435,8 +2696,8 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
 	 */
 	/* since handle_stripe can be called at any time we need to handle the
 	 * case where a compute block operation has been submitted and then a
-	 * subsequent call wants to start a write request.  raid5_run_ops only
-	 * handles the case where compute block and postxor are requested
+	 * subsequent call wants to start a write request.  raid_run_ops only
+	 * handles the case where compute block and reconstruct are requested
	 * simultaneously.  If this is not the case then new writes need to be
 	 * held off until the compute completes.
 	 */
@@ -2618,6 +2879,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 				set_bit(R5_Wantcompute,
 					&sh->dev[sh->pd_idx].flags);
 				sh->ops.target = sh->pd_idx;
+				sh->ops.target2 = -1;
 				s->uptodate++;
 			}
 		}
@@ -3067,7 +3329,7 @@ static bool handle_stripe5(struct stripe_head *sh)
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
 	if (s.ops_request)
-		raid5_run_ops(sh, s.ops_request);
+		raid_run_ops(sh, s.ops_request);
 
 	ops_run_io(sh, &s);
 
