From 5014c311baa2b21384321fa4a9f617a92e3e56f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 2 Sep 2015 16:46:02 -0600 Subject: block: fix bogus compiler warnings in blk-merge.c The compiler can't figure out that bvprv is initialized whenever 'prev' is set to 1 as well. Use a pointer to bvprv instead, setting it to NULL initially, and get rid of the 'prev' tracking. This dumbs it down enough that gcc is happy. Signed-off-by: Jens Axboe --- block/blk-merge.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index d088cffb8105..cce23ba1ae5f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -67,10 +67,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio_set *bs) { struct bio *split; - struct bio_vec bv, bvprv; + struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned seg_size = 0, nsegs = 0, sectors = 0; - int prev = 0; bio_for_each_segment(bv, bio, iter) { sectors += bv.bv_len >> 9; @@ -82,20 +81,20 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ - if (prev && bvec_gap_to_prev(q, &bvprv, bv.bv_offset)) + if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) goto split; - if (prev && blk_queue_cluster(q)) { + if (bvprvp && blk_queue_cluster(q)) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv)) + if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv)) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) + if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) goto new_segment; seg_size += bv.bv_len; bvprv = bv; - prev = 1; + bvprvp = &bv; continue; } new_segment: @@ -104,7 +103,7 @@ new_segment: nsegs++; bvprv = bv; - prev = 1; + bvprvp = &bv; seg_size = bv.bv_len; } -- cgit v1.2.3 From 5e7c4274a70aa2d6f485996d0ca1dad52d0039ca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Sep 2015 19:28:20 +0300 Subject: block: Check for gaps on front and back merges We are checking for gaps to previous bio_vec, which can only detect back merges gaps. Moreover, at the point where we check for a gap, we don't know if we will attempt a back or a front merge. Thus, check for gap to prev in a back merge attempt and check for a gap to next in a front merge attempt. Signed-off-by: Jens Axboe [sagig: Minor rename change] Signed-off-by: Sagi Grimberg --- block/blk-merge.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index cce23ba1ae5f..d9eddbc189f5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -438,6 +438,8 @@ no_merge: int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + if (req_gap_back_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -456,6 +458,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + + if (req_gap_front_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -482,14 +487,6 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } -static int req_gap_to_prev(struct request *req, struct bio *next) -{ - struct bio *prev = req->biotail; - - return bvec_gap_to_prev(req->q, &prev->bi_io_vec[prev->bi_vcnt - 1], - next->bi_io_vec[0].bv_offset); -} - static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -504,7 +501,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; - if (req_gap_to_prev(req, next->bio)) + if (req_gap_back_merge(req, next->bio)) return 0; /* @@ -712,10 +709,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; - /* Only check gaps if the bio carries data */ - if (bio_has_data(bio) && req_gap_to_prev(rq, bio)) - return false; - return true; } -- cgit v1.2.3 From 7f39add3b08cbbdb99abe50e6d7c342e6800d684 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 11 Sep 2015 09:03:04 -0600 Subject: block: Refuse request/bio merges with gaps in the integrity payload If a driver sets the block queue virtual boundary mask, it means that it cannot handle gaps so we must not allow those in the integrity payload as well. Signed-off-by: Sagi Grimberg Fixed up by me to have duplicate integrity merge functions, depending on whether block integrity is enabled or not. Fixes a compilations issue with CONFIG_BLK_DEV_INTEGRITY unset. Signed-off-by: Jens Axboe --- block/blk-integrity.c | 3 +++ block/blk-merge.c | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index f548b64be092..75f29cf70188 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -204,6 +204,9 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, q->limits.max_integrity_segments) return false; + if (integrity_req_gap_back_merge(req, next->bio)) + return false; + return true; } EXPORT_SYMBOL(blk_integrity_merge_rq); diff --git a/block/blk-merge.c b/block/blk-merge.c index d9eddbc189f5..574ea7c0468f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -440,6 +440,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, { if (req_gap_back_merge(req, bio)) return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_back_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -461,6 +464,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, if (req_gap_front_merge(req, bio)) return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_front_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; -- cgit v1.2.3 From 87a816df537e096d404add543ef47b796906c130 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 8 Sep 2015 09:33:35 -0600 Subject: block: Refuse adding appending a gapped integrity page to a bio This is only theoretical at the moment given that the only subsystems that generate integrity payloads are the block layer itself and the scsi target (which generate well aligned integrity payloads). But when we will expose integrity meta-data to user-space, we'll need to refuse appending a page with a gap (if the queue virtual boundary is set). Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/bio-integrity.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4aecca79374a..14b8faf8b09d 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -140,6 +140,11 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; + if (bip->bip_vcnt && + bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), + &bip->bip_vec[bip->bip_vcnt - 1], offset)) + return 0; + iv->bv_page = page; iv->bv_len = len; iv->bv_offset = offset; -- cgit v1.2.3 From 46348456c1791053dcbe5a9e21825b10a3c8a8fb Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 3 Sep 2015 19:28:23 +0300 Subject: block: Copy a user iovec if it includes gaps For drivers that don't support gaps in the SG lists handed to them we must bounce (copy the user buffers) and pass a bio that does not include gaps. This doesn't matter for any current user, but will help to allow iser which can't handle gaps to use the block virtual boundary instead of using driver-local bounce buffering when handling SG_IO commands. Signed-off-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-map.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 233841644c9d..f565e11f465a 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,6 +9,24 @@ #include "blk.h" +static bool iovec_gap_to_prv(struct request_queue *q, + struct iovec *prv, struct iovec *cur) +{ + unsigned long prev_end; + + if (!queue_virt_boundary(q)) + return false; + + if (prv->iov_base == NULL && prv->iov_len == 0) + /* prv is not set - don't check */ + return false; + + prev_end = (unsigned long)(prv->iov_base + prv->iov_len); + + return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || + prev_end & queue_virt_boundary(q)); +} + int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -67,7 +85,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct bio *bio; int unaligned = 0; struct iov_iter i; - struct iovec iov; + struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; if (!iter || !iter->count) return -EINVAL; @@ -81,8 +99,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, /* * Keep going so we check length of all segments */ - if (uaddr & queue_dma_alignment(q)) + if ((uaddr & queue_dma_alignment(q)) || + iovec_gap_to_prv(q, &prv, &iov)) unaligned = 1; + + prv.iov_base = iov.iov_base; + prv.iov_len = iov.iov_len; } if (unaligned || (q->dma_pad_mask & iter->count) || map_data) -- cgit v1.2.3 From 6fe810bda0bd9a5d7674fc671fac27b8aa8ec243 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 5 Sep 2015 15:47:36 -0400 Subject: block: blkg_destroy_all() should clear q->root_blkg and ->root_rl.blkg While making the root blkg unconditional, ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root blkcg") removed the part which clears q->root_blkg and ->root_rl.blkg during q exit. This leaves the two pointers dangling after blkg_destroy_all(). blk-throttle exit path performs blkg traversals and dereferences ->root_blkg and can lead to the following oops. BUG: unable to handle kernel NULL pointer dereference at 0000000000000558 IP: [] __blkg_lookup+0x26/0x70 ... task: ffff88001b4e2580 ti: ffff88001ac0c000 task.ti: ffff88001ac0c000 RIP: 0010:[] [] __blkg_lookup+0x26/0x70 ... Call Trace: [] blk_throtl_drain+0x5a/0x110 [] blkcg_drain_queue+0x18/0x20 [] __blk_drain_queue+0xc0/0x170 [] blk_queue_bypass_start+0x61/0x80 [] blkcg_deactivate_policy+0x39/0x100 [] blk_throtl_exit+0x38/0x50 [] blkcg_exit_queue+0x3e/0x50 [] blk_release_queue+0x1e/0xc0 ... While the bug is a straigh-forward use-after-free bug, it is tricky to reproduce because blkg release is RCU protected and the rest of exit path usually finishes before RCU grace period. This patch fixes the bug by updating blkg_destro_all() to clear q->root_blkg and ->root_rl.blkg. Signed-off-by: Tejun Heo Reported-by: "Richard W.M. Jones" Reported-by: Josh Boyer Link: http://lkml.kernel.org/g/CA+5PVA5rzQ0s4723n5rHBcxQa9t0cW8BPPBekr_9aMRoWt2aYg@mail.gmail.com Fixes: ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root blkcg") Cc: stable@vger.kernel.org # v4.2+ Tested-by: Richard W.M. Jones Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d6283b3f5db5..9cc48d1d7abb 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -387,6 +387,9 @@ static void blkg_destroy_all(struct request_queue *q) blkg_destroy(blkg); spin_unlock(&blkcg->lock); } + + q->root_blkg = NULL; + q->root_rl.blkg = NULL; } /* -- cgit v1.2.3 From 52cc6eead9095e2faf2ec7afc013aa3af1f01ac5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 17 Sep 2015 09:58:38 -0600 Subject: block: blk-merge: fast-clone bio when splitting rw bios biovecs has become immutable since v3.13, so it isn't necessary to allocate biovecs for the new cloned bios, then we can save one extra biovecs allocation/copy, and the allocation is often not fixed-length and a bit more expensive. For example, if the 'max_sectors_kb' of null blk's queue is set as 16(32 sectors) via sysfs just for making more splits, this patch can increase throught about ~70% in the sequential read test over null_blk(direct io, bs: 1M). Cc: Christoph Hellwig Cc: Kent Overstreet Cc: Ming Lin Cc: Dongsu Park Signed-off-by: Ming Lei This fixes a performance regression introduced by commit 54efd50bfd, and allows us to take full advantage of the fact that we have immutable bio_vecs. Hand applied, as it rejected violently with commit 5014c311baa2. Signed-off-by: Jens Axboe --- block/blk-merge.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 574ea7c0468f..c4e9c37f3e38 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -66,15 +66,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, struct bio_set *bs) { - struct bio *split; struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned seg_size = 0, nsegs = 0, sectors = 0; bio_for_each_segment(bv, bio, iter) { - sectors += bv.bv_len >> 9; - - if (sectors > queue_max_sectors(q)) + if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) goto split; /* @@ -95,6 +92,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, seg_size += bv.bv_len; bvprv = bv; bvprvp = &bv; + sectors += bv.bv_len >> 9; continue; } new_segment: @@ -105,21 +103,12 @@ new_segment: bvprv = bv; bvprvp = &bv; seg_size = bv.bv_len; + sectors += bv.bv_len >> 9; } return NULL; split: - split = bio_clone_bioset(bio, GFP_NOIO, bs); - - split->bi_iter.bi_size -= iter.bi_size; - bio->bi_iter = iter; - - if (bio_integrity(bio)) { - bio_integrity_advance(bio, split->bi_iter.bi_size); - bio_integrity_trim(split, 0, bio_sectors(split)); - } - - return split; + return bio_split(bio, sectors, GFP_NOIO, bs); } void blk_queue_split(struct request_queue *q, struct bio **bio, -- cgit v1.2.3 From 994518799930fc363d47cb7cf0d1abed1790bf16 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 18 Sep 2015 00:06:28 +0800 Subject: block: fix bounce_end_io When bio bounce is involved, one new bio and its biovecs are cloned from the comming bio, which can be one fast-cloned bio from upper layer(such as dm). So it is obviously wrong to assume the start index of the coming( original) bio's io vector is zero, which can be any value between 0 and (bi_max_vecs - 1), especially in case of bio split. This patch fixes Fedora's booting oops on i386, often with the following kernel log together: > [ 9.026738] systemd[1]: Switching root. > [ 9.036467] systemd-journald[149]: Received SIGTERM from PID 1 > (systemd). > [ 9.082262] BUG: Bad page state in process kworker/u5:1 pfn:372ac > [ 9.083989] page:f3d32ae0 count:0 mapcount:0 mapping:f2252178 > index:0x16a > [ 9.085755] flags: 0x40020021(locked|lru|mappedtodisk) > [ 9.087284] page dumped because: page still charged to cgroup > [ 9.088772] bad because of flags: > [ 9.089731] flags: 0x21(locked|lru) > [ 9.090818] page->mem_cgroup:f2c3e400 Reported-by: Josh Boyer Tested-by: Adam Williamson Cc: Ming Lin Cc: Mike Snitzer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bounce.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bounce.c b/block/bounce.c index 2c310ea007ee..457ebd57542f 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -128,12 +128,14 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, *org_vec; int i; + int start = bio_orig->bi_iter.bi_idx; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i; + org_vec = bio_orig->bi_io_vec + i + start; + if (bvec->bv_page == org_vec->bv_page) continue; -- cgit v1.2.3