From 6719fb036cea56a5ee9d0ac912ed8c7cabb27f49 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 18 Oct 2010 23:04:07 +0200 Subject: drbd: fix potential data divergence after multiple failures If we get an IO-error during an activity log transaction, if we failed to write the bitmap of the evicted extent, we must not write the transaction itself. If we failed to write the transaction, we must not even submit the corresponding bio, as its extent is not yet marked in the activity log. Otherwise, if this was a disconneted Primary (degraded cluster), which now lost its disk as well, and we later re-attach the same backend storage, we possibly "forget" to resync some parts of the disk that potentially have been changed. On the receiving side, when receiving from a peer with unhealthy disk, checking for pdsk == D_DISKLESS is not enough, we need to set out of sync and do AL transactions for everything pdsk < D_INCONSISTENT on the receiving side. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'drivers/block/drbd/drbd_req.c') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 9e91a2545fc8..d26b213dbf15 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -942,12 +942,21 @@ allocate_barrier: if (local) { req->private_bio->bi_bdev = mdev->ldev->backing_bdev; - if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR - : rw == READ ? DRBD_FAULT_DT_RD - : DRBD_FAULT_DT_RA)) + /* State may have changed since we grabbed our reference on the + * mdev->ldev member. Double check, and short-circuit to endio. + * In case the last activity log transaction failed to get on + * stable storage, and this is a WRITE, we may not even submit + * this bio. */ + if (get_ldev(mdev)) { + if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR + : rw == READ ? DRBD_FAULT_DT_RD + : DRBD_FAULT_DT_RA)) + bio_endio(req->private_bio, -EIO); + else + generic_make_request(req->private_bio); + put_ldev(mdev); + } else bio_endio(req->private_bio, -EIO); - else - generic_make_request(req->private_bio); } /* we need to plug ALWAYS since we possibly need to kick lo_dev. -- cgit v1.2.3 From fb2c7a10eec051317ff091b2cb2d73c5ecd98c19 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Oct 2010 12:08:13 +0200 Subject: drbd: rate limit an error message If we don't rate limit it, and you happen to log err level messages via serial console, an IO error on a disconnected Primary may cause serious unresponsiveness. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block/drbd/drbd_req.c') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d26b213dbf15..31d04b17c45f 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -813,7 +813,8 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) mdev->state.conn >= C_CONNECTED)); if (!(local || remote) && !is_susp(mdev->state)) { - dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); goto fail_free_complete; } -- cgit v1.2.3 From 8825f7c3e5c7b251b49fc594658a96f59417ee16 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 21 Oct 2010 17:21:19 +0200 Subject: drbd: Silenced an assert That assertion's condition needed adjustment for today's semantics Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/drbd/drbd_req.c') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 31d04b17c45f..5c2254853559 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -258,7 +258,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) if (!hlist_unhashed(&req->colision)) hlist_del(&req->colision); else - D_ASSERT((s & RQ_NET_MASK) == 0); + D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); /* for writes we need to do some extra housekeeping */ if (rw == WRITE) -- cgit v1.2.3 From 650789c87f16dcdf1dd0a67ac7461b7537534855 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 25 Aug 2010 10:47:17 +0200 Subject: drbd: Removed checks for REQ_HARDBARRIER on incomming BIOs Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'drivers/block/drbd/drbd_req.c') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 5c2254853559..11a75d32a2e2 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1032,20 +1032,6 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio) return 0; } - /* Reject barrier requests if we know the underlying device does - * not support them. - * XXX: Need to get this info from peer as well some how so we - * XXX: reject if EITHER side/data/metadata area does not support them. - * - * because of those XXX, this is not yet enabled, - * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. - */ - if (unlikely(bio->bi_rw & REQ_HARDBARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags)) { - /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ - bio_endio(bio, -EOPNOTSUPP); - return 0; - } - /* * what we "blindly" assume: */ -- cgit v1.2.3