diff options
Diffstat (limited to 'drivers/block/xen-blkfront.c')
-rw-r--r-- | drivers/block/xen-blkfront.c | 532 |
1 files changed, 432 insertions, 100 deletions
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d89ef86220f4..a4660bbee8a6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -74,12 +74,30 @@ struct grant { struct blk_shadow { struct blkif_request req; struct request *request; - struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct grant **grants_used; + struct grant **indirect_grants; + struct scatterlist *sg; +}; + +struct split_bio { + struct bio *bio; + atomic_t pending; + int err; }; static DEFINE_MUTEX(blkfront_mutex); static const struct block_device_operations xlvbd_block_fops; +/* + * Maximum number of segments in indirect requests, the actual value used by + * the frontend driver is the minimum of this value and the value provided + * by the backend driver. + */ + +static unsigned int xen_blkif_max_segments = 32; +module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); +MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); + #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) /* @@ -98,7 +116,6 @@ struct blkfront_info enum blkif_state connected; int ring_ref; struct blkif_front_ring ring; - struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int evtchn, irq; struct request_queue *rq; struct work_struct work; @@ -114,6 +131,7 @@ struct blkfront_info unsigned int discard_granularity; unsigned int discard_alignment; unsigned int feature_persistent:1; + unsigned int max_indirect_segments; int is_ready; }; @@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock); #define DEV_NAME "xvd" /* name in /dev */ +#define SEGS_PER_INDIRECT_FRAME \ + (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) +#define INDIRECT_GREFS(_segs) \ + ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) + +static int blkfront_setup_indirect(struct blkfront_info *info); + static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; @@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req) struct blkif_request *ring_req; unsigned long id; unsigned int fsect, lsect; - int i, ref; + int i, ref, n; + struct blkif_request_segment_aligned *segments = NULL; /* * Used to store if we are able to queue the request by just using @@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req) grant_ref_t gref_head; struct grant *gnt_list_entry = NULL; struct scatterlist *sg; + int nseg, max_grefs; if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return 1; - /* Check if we have enought grants to allocate a requests */ - if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { + max_grefs = info->max_indirect_segments ? + info->max_indirect_segments + + INDIRECT_GREFS(info->max_indirect_segments) : + BLKIF_MAX_SEGMENTS_PER_REQUEST; + + /* Check if we have enough grants to allocate a requests */ + if (info->persistent_gnts_c < max_grefs) { new_persistent_gnts = 1; if (gnttab_alloc_grant_references( - BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, + max_grefs - info->persistent_gnts_c, &gref_head) < 0) { gnttab_request_free_callback( &info->callback, blkif_restart_queue_callback, info, - BLKIF_MAX_SEGMENTS_PER_REQUEST); + max_grefs); return 1; } } else @@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req) id = get_id_from_freelist(info); info->shadow[id].request = req; - ring_req->u.rw.id = id; - ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->u.rw.handle = info->handle; - - ring_req->operation = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; - - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { - /* - * Ideally we can do an unordered flush-to-disk. In case the - * backend onlysupports barriers, use that. A barrier request - * a superset of FUA, so we can implement it the same - * way. (It's also a FLUSH+FUA, since it is - * guaranteed ordered WRT previous writes.) - */ - ring_req->operation = info->flush_op; - } - if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { - /* id, sector_number and handle are set above. */ ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + ring_req->u.discard.id = id; + ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; else ring_req->u.discard.flag = 0; } else { - ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, - info->sg); - BUG_ON(ring_req->u.rw.nr_segments > - BLKIF_MAX_SEGMENTS_PER_REQUEST); - - for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { + BUG_ON(info->max_indirect_segments == 0 && + req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + BUG_ON(info->max_indirect_segments && + req->nr_phys_segments > info->max_indirect_segments); + nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + ring_req->u.rw.id = id; + if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + /* + * The indirect operation can only be a BLKIF_OP_READ or + * BLKIF_OP_WRITE + */ + BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->u.indirect.indirect_op = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.indirect.handle = info->handle; + ring_req->u.indirect.nr_segments = nseg; + } else { + ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.rw.handle = info->handle; + ring_req->operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { + /* + * Ideally we can do an unordered flush-to-disk. In case the + * backend onlysupports barriers, use that. A barrier request + * a superset of FUA, so we can implement it the same + * way. (It's also a FLUSH+FUA, since it is + * guaranteed ordered WRT previous writes.) + */ + ring_req->operation = info->flush_op; + } + ring_req->u.rw.nr_segments = nseg; + } + for_each_sg(info->shadow[id].sg, sg, nseg, i) { fsect = sg->offset >> 9; lsect = fsect + (sg->length >> 9) - 1; + if ((ring_req->operation == BLKIF_OP_INDIRECT) && + (i % SEGS_PER_INDIRECT_FRAME == 0)) { + if (segments) + kunmap_atomic(segments); + + n = i / SEGS_PER_INDIRECT_FRAME; + gnt_list_entry = get_grant(&gref_head, info); + info->shadow[id].indirect_grants[n] = gnt_list_entry; + segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; + } + gnt_list_entry = get_grant(&gref_head, info); ref = gnt_list_entry->gref; @@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req) BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic( - pfn_to_page(gnt_list_entry->pfn)); + shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); bvec_data = kmap_atomic(sg_page(sg)); /* @@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req) kunmap_atomic(bvec_data); kunmap_atomic(shared_data); } - - ring_req->u.rw.seg[i] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; + if (ring_req->operation != BLKIF_OP_INDIRECT) { + ring_req->u.rw.seg[i] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } else { + n = i % SEGS_PER_INDIRECT_FRAME; + segments[n] = + (struct blkif_request_segment_aligned) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } } + if (segments) + kunmap_atomic(segments); } info->ring.req_prod_pvt++; @@ -542,7 +608,9 @@ wait: flush_requests(info); } -static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) +static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, + unsigned int physical_sector_size, + unsigned int segments) { struct request_queue *rq; struct blkfront_info *info = gd->private_data; @@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_logical_block_size(rq, sector_size); - blk_queue_max_hw_sectors(rq, 512); + blk_queue_physical_block_size(rq, physical_sector_size); + blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); /* Each segment in a request is up to an aligned page in size. */ blk_queue_segment_boundary(rq, PAGE_SIZE - 1); blk_queue_max_segment_size(rq, PAGE_SIZE); /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_segments(rq, segments); /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); @@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) static void xlvbd_flush(struct blkfront_info *info) { blk_queue_flush(info->rq, info->feature_flush); - printk(KERN_INFO "blkfront: %s: %s: %s %s\n", + printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", info->gd->disk_name, info->flush_op == BLKIF_OP_WRITE_BARRIER ? "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? "flush diskcache" : "barrier or flush"), - info->feature_flush ? "enabled" : "disabled", - info->feature_persistent ? "using persistent grants" : ""); + info->feature_flush ? "enabled;" : "disabled;", + "persistent grants:", + info->feature_persistent ? "enabled;" : "disabled;", + "indirect descriptors:", + info->max_indirect_segments ? "enabled;" : "disabled;"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n) static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, - u16 vdisk_info, u16 sector_size) + u16 vdisk_info, u16 sector_size, + unsigned int physical_sector_size) { struct gendisk *gd; int nr_minors = 1; @@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, gd->driverfs_dev = &(info->xbdev->dev); set_capacity(gd, capacity); - if (xlvbd_init_blk_queue(gd, sector_size)) { + if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, + info->max_indirect_segments ? : + BLKIF_MAX_SEGMENTS_PER_REQUEST)) { del_gendisk(gd); goto release; } @@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) { struct grant *persistent_gnt; struct grant *n; + int i, j, segs; /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&info->io_lock); @@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend) } BUG_ON(info->persistent_gnts_c != 0); + for (i = 0; i < BLK_RING_SIZE; i++) { + /* + * Clear persistent grants present in requests already + * on the shared ring + */ + if (!info->shadow[i].request) + goto free_shadow; + + segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? + info->shadow[i].req.u.indirect.nr_segments : + info->shadow[i].req.u.rw.nr_segments; + for (j = 0; j < segs; j++) { + persistent_gnt = info->shadow[i].grants_used[j]; + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + + if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + /* + * If this is not an indirect operation don't try to + * free indirect segments + */ + goto free_shadow; + + for (j = 0; j < INDIRECT_GREFS(segs); j++) { + persistent_gnt = info->shadow[i].indirect_grants[j]; + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + +free_shadow: + kfree(info->shadow[i].grants_used); + info->shadow[i].grants_used = NULL; + kfree(info->shadow[i].indirect_grants); + info->shadow[i].indirect_grants = NULL; + kfree(info->shadow[i].sg); + info->shadow[i].sg = NULL; + } + /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); spin_unlock_irq(&info->io_lock); @@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, struct blkif_response *bret) { int i = 0; - struct bio_vec *bvec; - struct req_iterator iter; - unsigned long flags; + struct scatterlist *sg; char *bvec_data; void *shared_data; - unsigned int offset = 0; + int nseg; + + nseg = s->req.operation == BLKIF_OP_INDIRECT ? + s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; if (bret->operation == BLKIF_OP_READ) { /* @@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, * than PAGE_SIZE, we have to keep track of the current offset, * to be sure we are copying the data from the right shared page. */ - rq_for_each_segment(bvec, s->request, iter) { - BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); - if (bvec->bv_offset < offset) - i++; - BUG_ON(i >= s->req.u.rw.nr_segments); + for_each_sg(s->sg, sg, nseg, i) { + BUG_ON(sg->offset + sg->length > PAGE_SIZE); shared_data = kmap_atomic( pfn_to_page(s->grants_used[i]->pfn)); - bvec_data = bvec_kmap_irq(bvec, &flags); - memcpy(bvec_data, shared_data + bvec->bv_offset, - bvec->bv_len); - bvec_kunmap_irq(bvec_data, &flags); + bvec_data = kmap_atomic(sg_page(sg)); + memcpy(bvec_data + sg->offset, + shared_data + sg->offset, + sg->length); + kunmap_atomic(bvec_data); kunmap_atomic(shared_data); - offset = bvec->bv_offset + bvec->bv_len; } } /* Add the persistent grant into the list of free grants */ - for (i = 0; i < s->req.u.rw.nr_segments; i++) { + for (i = 0; i < nseg; i++) { list_add(&s->grants_used[i]->node, &info->persistent_gnts); info->persistent_gnts_c++; } + if (s->req.operation == BLKIF_OP_INDIRECT) { + for (i = 0; i < INDIRECT_GREFS(nseg); i++) { + list_add(&s->indirect_grants[i]->node, &info->persistent_gnts); + info->persistent_gnts_c++; + } + } } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev, SHARED_RING_INIT(sring); FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); - sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - /* Allocate memory for grants */ - err = fill_grant_buffer(info, BLK_RING_SIZE * - BLKIF_MAX_SEGMENTS_PER_REQUEST); - if (err) - goto fail; - err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); if (err < 0) { free_page((unsigned long)sring); @@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev, return 0; } +/* + * This is a clone of md_trim_bio, used to split a bio into smaller ones + */ +static void trim_bio(struct bio *bio, int offset, int size) +{ + /* 'bio' is a cloned bio which we need to trim to match + * the given offset and size. + * This requires adjusting bi_sector, bi_size, and bi_io_vec + */ + int i; + struct bio_vec *bvec; + int sofar = 0; + + size <<= 9; + if (offset == 0 && size == bio->bi_size) + return; + + bio->bi_sector += offset; + bio->bi_size = size; + offset <<= 9; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + + while (bio->bi_idx < bio->bi_vcnt && + bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { + /* remove this whole bio_vec */ + offset -= bio->bi_io_vec[bio->bi_idx].bv_len; + bio->bi_idx++; + } + if (bio->bi_idx < bio->bi_vcnt) { + bio->bi_io_vec[bio->bi_idx].bv_offset += offset; + bio->bi_io_vec[bio->bi_idx].bv_len -= offset; + } + /* avoid any complications with bi_idx being non-zero*/ + if (bio->bi_idx) { + memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, + (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); + bio->bi_vcnt -= bio->bi_idx; + bio->bi_idx = 0; + } + /* Make sure vcnt and last bv are not too big */ + bio_for_each_segment(bvec, bio, i) { + if (sofar + bvec->bv_len > size) + bvec->bv_len = size - sofar; + if (bvec->bv_len == 0) { + bio->bi_vcnt = i; + break; + } + sofar += bvec->bv_len; + } +} + +static void split_bio_end(struct bio *bio, int error) +{ + struct split_bio *split_bio = bio->bi_private; + + if (error) + split_bio->err = error; + + if (atomic_dec_and_test(&split_bio->pending)) { + split_bio->bio->bi_phys_segments = 0; + bio_endio(split_bio->bio, split_bio->err); + kfree(split_bio); + } + bio_put(bio); +} static int blkif_recover(struct blkfront_info *info) { int i; - struct blkif_request *req; + struct request *req, *n; struct blk_shadow *copy; - int j; + int rc; + struct bio *bio, *cloned_bio; + struct bio_list bio_list, merge_bio; + unsigned int segs, offset; + int pending, size; + struct split_bio *split_bio; + struct list_head requests; /* Stage 1: Make a safe copy of the shadow state. */ copy = kmemdup(info->shadow, sizeof(info->shadow), @@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info) info->shadow_free = info->ring.req_prod_pvt; info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; - /* Stage 3: Find pending requests and requeue them. */ + rc = blkfront_setup_indirect(info); + if (rc) { + kfree(copy); + return rc; + } + + segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; + blk_queue_max_segments(info->rq, segs); + bio_list_init(&bio_list); + INIT_LIST_HEAD(&requests); for (i = 0; i < BLK_RING_SIZE; i++) { /* Not in use? */ if (!copy[i].request) continue; - /* Grab a request slot and copy shadow state into it. */ - req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - *req = copy[i].req; - - /* We get a new request id, and must reset the shadow state. */ - req->u.rw.id = get_id_from_freelist(info); - memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i])); - - if (req->operation != BLKIF_OP_DISCARD) { - /* Rewrite any grant references invalidated by susp/resume. */ - for (j = 0; j < req->u.rw.nr_segments; j++) - gnttab_grant_foreign_access_ref( - req->u.rw.seg[j].gref, - info->xbdev->otherend_id, - pfn_to_mfn(copy[i].grants_used[j]->pfn), - 0); + /* + * Get the bios in the request so we can re-queue them. + */ + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(©[i].request->queuelist, &requests); + continue; } - info->shadow[req->u.rw.id].req = *req; - - info->ring.req_prod_pvt++; + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_put_request(copy[i].request); } kfree(copy); + /* + * Empty the queue, this is important because we might have + * requests in the queue with more segments than what we + * can handle now. + */ + spin_lock_irq(&info->io_lock); + while ((req = blk_fetch_request(info->rq)) != NULL) { + if (req->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + list_add(&req->queuelist, &requests); + continue; + } + merge_bio.head = req->bio; + merge_bio.tail = req->biotail; + bio_list_merge(&bio_list, &merge_bio); + req->bio = NULL; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) + pr_alert("diskcache flush request found!\n"); + __blk_put_request(info->rq, req); + } + spin_unlock_irq(&info->io_lock); + xenbus_switch_state(info->xbdev, XenbusStateConnected); spin_lock_irq(&info->io_lock); @@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info) /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - /* Send off requeued requests */ - flush_requests(info); - /* Kick any other new requests queued since we resumed */ kick_pending_request_queues(info); + list_for_each_entry_safe(req, n, &requests, queuelist) { + /* Requeue pending requests (flush or discard) */ + list_del_init(&req->queuelist); + BUG_ON(req->nr_phys_segments > segs); + blk_requeue_request(info->rq, req); + } spin_unlock_irq(&info->io_lock); + while ((bio = bio_list_pop(&bio_list)) != NULL) { + /* Traverse the list of pending bios and re-queue them */ + if (bio_segments(bio) > segs) { + /* + * This bio has more segments than what we can + * handle, we have to split it. + */ + pending = (bio_segments(bio) + segs - 1) / segs; + split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); + BUG_ON(split_bio == NULL); + atomic_set(&split_bio->pending, pending); + split_bio->bio = bio; + for (i = 0; i < pending; i++) { + offset = (i * segs * PAGE_SIZE) >> 9; + size = min((unsigned int)(segs * PAGE_SIZE) >> 9, + (unsigned int)(bio->bi_size >> 9) - offset); + cloned_bio = bio_clone(bio, GFP_NOIO); + BUG_ON(cloned_bio == NULL); + trim_bio(cloned_bio, offset, size); + cloned_bio->bi_private = split_bio; + cloned_bio->bi_end_io = split_bio_end; + submit_bio(cloned_bio->bi_rw, cloned_bio); + } + /* + * Now we have to wait for all those smaller bios to + * end, so we can also end the "parent" bio. + */ + continue; + } + /* We don't need to split this bio */ + submit_bio(bio->bi_rw, bio); + } + return 0; } @@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev) blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); err = talk_to_blkback(dev, info); - if (info->connected == BLKIF_STATE_SUSPENDED && !err) - err = blkif_recover(info); + + /* + * We have to wait for the backend to switch to + * connected state, since we want to read which + * features it supports. + */ return err; } @@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info) kfree(type); } +static int blkfront_setup_indirect(struct blkfront_info *info) +{ + unsigned int indirect_segments, segs; + int err, i; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-max-indirect-segments", "%u", &indirect_segments, + NULL); + if (err) { + info->max_indirect_segments = 0; + segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; + } else { + info->max_indirect_segments = min(indirect_segments, + xen_blkif_max_segments); + segs = info->max_indirect_segments; + } + + err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); + if (err) + goto out_of_memory; + + for (i = 0; i < BLK_RING_SIZE; i++) { + info->shadow[i].grants_used = kzalloc( + sizeof(info->shadow[i].grants_used[0]) * segs, + GFP_NOIO); + info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); + if (info->max_indirect_segments) + info->shadow[i].indirect_grants = kzalloc( + sizeof(info->shadow[i].indirect_grants[0]) * + INDIRECT_GREFS(segs), + GFP_NOIO); + if ((info->shadow[i].grants_used == NULL) || + (info->shadow[i].sg == NULL) || + (info->max_indirect_segments && + (info->shadow[i].indirect_grants == NULL))) + goto out_of_memory; + sg_init_table(info->shadow[i].sg, segs); + } + + + return 0; + +out_of_memory: + for (i = 0; i < BLK_RING_SIZE; i++) { + kfree(info->shadow[i].grants_used); + info->shadow[i].grants_used = NULL; + kfree(info->shadow[i].sg); + info->shadow[i].sg = NULL; + kfree(info->shadow[i].indirect_grants); + info->shadow[i].indirect_grants = NULL; + } + return -ENOMEM; +} + /* * Invoked when the backend is finally 'ready' (and has told produced * the details about the physical device - #sectors, size, etc). @@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info) { unsigned long long sectors; unsigned long sector_size; + unsigned int physical_sector_size; unsigned int binfo; int err; int barrier, flush, discard, persistent; @@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info) set_capacity(info->gd, sectors); revalidate_disk(info->gd); - /* fall through */ + return; case BLKIF_STATE_SUSPENDED: + /* + * If we are recovering from suspension, we need to wait + * for the backend to announce it's features before + * reconnecting, at least we need to know if the backend + * supports indirect descriptors, and how many. + */ + blkif_recover(info); return; default: @@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info) return; } + /* + * physcial-sector-size is a newer field, so old backends may not + * provide this. Assume physical sector size to be the same as + * sector_size in that case. + */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "physical-sector-size", "%u", &physical_sector_size); + if (err != 1) + physical_sector_size = sector_size; + info->feature_flush = 0; info->flush_op = 0; @@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info) else info->feature_persistent = persistent; - err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); + err = blkfront_setup_indirect(info); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + return; + } + + err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, + physical_sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", info->xbdev->otherend); |