summaryrefslogtreecommitdiff
path: root/drivers/block/drbd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r--drivers/block/drbd/drbd_actlog.c246
-rw-r--r--drivers/block/drbd/drbd_bitmap.c13
-rw-r--r--drivers/block/drbd/drbd_int.h179
-rw-r--r--drivers/block/drbd/drbd_main.c251
-rw-r--r--drivers/block/drbd/drbd_nl.c200
-rw-r--r--drivers/block/drbd/drbd_proc.c10
-rw-r--r--drivers/block/drbd/drbd_receiver.c16
-rw-r--r--drivers/block/drbd/drbd_req.c192
-rw-r--r--drivers/block/drbd/drbd_req.h8
-rw-r--r--drivers/block/drbd/drbd_state.c28
-rw-r--r--drivers/block/drbd/drbd_strings.c1
-rw-r--r--drivers/block/drbd/drbd_worker.c24
12 files changed, 845 insertions, 323 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 92510f8ad013..6608076dc39e 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -104,7 +104,6 @@ struct update_al_work {
int err;
};
-static int al_write_transaction(struct drbd_conf *mdev);
void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
bio->bi_end_io = drbd_md_io_complete;
bio->bi_rw = rw;
- if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */
+ if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
+ /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+ ;
+ else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
+ /* Corresponding put_ldev in drbd_md_io_complete() */
dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
err = -ENODEV;
goto out;
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
BUG_ON(!bdev->md_bdev);
- dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
+ dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
current->comm, current->pid, __func__,
- (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+ (void*)_RET_IP_ );
if (sector < drbd_md_first_sector(bdev) ||
sector + 7 > drbd_md_last_sector(bdev))
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
current->comm, current->pid, __func__,
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
- err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
+ /* we do all our meta data IO in aligned 4k blocks. */
+ err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
if (err) {
dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
return err;
}
-static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr)
{
- struct lc_element *al_ext;
struct lc_element *tmp;
- int wake;
-
- spin_lock_irq(&mdev->al_lock);
tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
if (unlikely(tmp != NULL)) {
struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
- if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
- wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
- spin_unlock_irq(&mdev->al_lock);
- if (wake)
- wake_up(&mdev->al_wait);
- return NULL;
- }
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags))
+ return bm_ext;
+ }
+ return NULL;
+}
+
+static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock)
+{
+ struct lc_element *al_ext;
+ struct bm_extent *bm_ext;
+ int wake;
+
+ spin_lock_irq(&mdev->al_lock);
+ bm_ext = find_active_resync_extent(mdev, enr);
+ if (bm_ext) {
+ wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
+ spin_unlock_irq(&mdev->al_lock);
+ if (wake)
+ wake_up(&mdev->al_wait);
+ return NULL;
}
- al_ext = lc_get(mdev->act_log, enr);
+ if (nonblock)
+ al_ext = lc_try_get(mdev->act_log, enr);
+ else
+ al_ext = lc_get(mdev->act_log, enr);
spin_unlock_irq(&mdev->al_lock);
return al_ext;
}
-void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
+bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i)
{
/* for bios crossing activity log extent boundaries,
* we may need to activate two extents in one go */
unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
- unsigned enr;
- bool locked = false;
+ D_ASSERT((unsigned)(last - first) <= 1);
+ D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+ /* FIXME figure out a fast path for bios crossing AL extent boundaries */
+ if (first != last)
+ return false;
+
+ return _al_get(mdev, first, true);
+}
+
+bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ bool need_transaction = false;
D_ASSERT(first <= last);
D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
- for (enr = first; enr <= last; enr++)
- wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ wait_event(mdev->al_wait,
+ (al_ext = _al_get(mdev, enr, false)) != NULL);
+ if (al_ext->lc_number != enr)
+ need_transaction = true;
+ }
+ return need_transaction;
+}
+
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate);
+
+/* When called through generic_make_request(), we must delegate
+ * activity log I/O to the worker thread: a further request
+ * submitted via generic_make_request() within the same task
+ * would be queued on current->bio_list, and would only start
+ * after this function returns (see generic_make_request()).
+ *
+ * However, if we *are* the worker, we must not delegate to ourselves.
+ */
+
+/*
+ * @delegate: delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate)
+{
+ bool locked = false;
+
+ BUG_ON(delegate && current == mdev->tconn->worker.task);
/* Serialize multiple transactions.
* This uses test_and_set_bit, memory barrier is implicit.
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
(locked = lc_try_lock_for_transaction(mdev->act_log)));
if (locked) {
- /* drbd_al_write_transaction(mdev,al_ext,enr);
- * recurses into generic_make_request(), which
- * disallows recursion, bios being serialized on the
- * current->bio_tail list now.
- * we have to delegate updates to the activity log
- * to the worker thread. */
-
/* Double check: it may have been committed by someone else,
* while we have been waiting for the lock. */
if (mdev->act_log->pending_changes) {
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
rcu_read_unlock();
- if (write_al_updates) {
- al_write_transaction(mdev);
- mdev->al_writ_cnt++;
- }
-
+ if (write_al_updates)
+ al_write_transaction(mdev, delegate);
spin_lock_irq(&mdev->al_lock);
/* FIXME
if (err)
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
}
}
+/*
+ * @delegate: delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
+{
+ BUG_ON(delegate && current == mdev->tconn->worker.task);
+
+ if (drbd_al_begin_io_prepare(mdev, i))
+ drbd_al_begin_io_commit(mdev, delegate);
+}
+
+int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+ struct lru_cache *al = mdev->act_log;
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned nr_al_extents;
+ unsigned available_update_slots;
+ unsigned enr;
+
+ D_ASSERT(first <= last);
+
+ nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */
+ available_update_slots = min(al->nr_elements - al->used,
+ al->max_pending_changes - al->pending_changes);
+
+ /* We want all necessary updates for a given request within the same transaction
+ * We could first check how many updates are *actually* needed,
+ * and use that instead of the worst-case nr_al_extents */
+ if (available_update_slots < nr_al_extents)
+ return -EWOULDBLOCK;
+
+ /* Is resync active in this area? */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *tmp;
+ tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+ if (unlikely(tmp != NULL)) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
+ return -EBUSY;
+ return -EWOULDBLOCK;
+ }
+ }
+ }
+
+ /* Checkout the refcounts.
+ * Given that we checked for available elements and update slots above,
+ * this has to be successful. */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ al_ext = lc_get_cumulative(mdev->act_log, enr);
+ if (!al_ext)
+ dev_info(DEV, "LOGIC BUG for enr=%u\n", enr);
+ }
+ return 0;
+}
+
void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
/* for bios crossing activity log extent boundaries,
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
(BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
+{
+ const unsigned int stripes = mdev->ldev->md.al_stripes;
+ const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;
+
+ /* transaction number, modulo on-disk ring buffer wrap around */
+ unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);
+
+ /* ... to aligned 4k on disk block */
+ t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+ /* ... to 512 byte sector in activity log */
+ t *= 8;
+
+ /* ... plus offset to the on disk position */
+ return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
+}
+
static int
_al_write_transaction(struct drbd_conf *mdev)
{
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev)
if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
mdev->al_tr_cycle = 0;
- sector = mdev->ldev->md.md_offset
- + mdev->ldev->md.al_offset
- + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
+ sector = al_tr_number_to_on_disk_sector(mdev);
crc = crc32c(0, buffer, 4096);
buffer->crc32c = cpu_to_be32(crc);
if (drbd_bm_write_hinted(mdev))
err = -EIO;
- /* drbd_chk_io_error done already */
- else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
- err = -EIO;
- drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
- } else {
- /* advance ringbuffer position and transaction counter */
- mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
- mdev->al_tr_number++;
+ else {
+ bool write_al_updates;
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+ if (write_al_updates) {
+ if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+ err = -EIO;
+ drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
+ } else {
+ mdev->al_tr_number++;
+ mdev->al_writ_cnt++;
+ }
+ }
}
drbd_md_put_buffer(mdev);
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused)
/* Calls from worker context (see w_restart_disk_io()) need to write the
transaction directly. Others came through generic_make_request(),
those need to delegate it to the worker. */
-static int al_write_transaction(struct drbd_conf *mdev)
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
{
- struct update_al_work al_work;
-
- if (current == mdev->tconn->worker.task)
+ if (delegate) {
+ struct update_al_work al_work;
+ init_completion(&al_work.event);
+ al_work.w.cb = w_al_write_transaction;
+ al_work.w.mdev = mdev;
+ drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
+ wait_for_completion(&al_work.event);
+ return al_work.err;
+ } else
return _al_write_transaction(mdev);
-
- init_completion(&al_work.event);
- al_work.w.cb = w_al_write_transaction;
- al_work.w.mdev = mdev;
- drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
- wait_for_completion(&al_work.event);
-
- return al_work.err;
}
static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8dc29502dc08..64fbb8385cdc 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
}
}
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
+{
+ u64 bitmap_sectors;
+ if (ldev->md.al_offset == 8)
+ bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
+ else
+ bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
+ return bitmap_sectors << (9 + 3);
+}
+
/*
* make sure the bitmap has enough room for the attached storage,
* if necessary, resize.
@@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
words = ALIGN(bits, 64) >> LN2_BPL;
if (get_ldev(mdev)) {
- u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
+ u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev);
put_ldev(mdev);
if (bits > bits_on_disk) {
dev_info(DEV, "bits = %lu\n", bits);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 6b51afa1aae1..f943aacfdad8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -753,13 +753,16 @@ struct drbd_md {
u32 flags;
u32 md_size_sect;
- s32 al_offset; /* signed relative sector offset to al area */
+ s32 al_offset; /* signed relative sector offset to activity log */
s32 bm_offset; /* signed relative sector offset to bitmap */
- /* u32 al_nr_extents; important for restoring the AL
- * is stored into ldev->dc.al_extents, which in turn
- * gets applied to act_log->nr_elements
- */
+ /* cached value of bdev->disk_conf->meta_dev_idx (see below) */
+ s32 meta_dev_idx;
+
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+ u32 al_size_4k; /* cached product of the above */
};
struct drbd_backing_dev {
@@ -891,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */
} send;
};
+struct submit_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head writes;
+};
+
struct drbd_conf {
struct drbd_tconn *tconn;
int vnr; /* volume number within the connection */
@@ -1009,7 +1020,6 @@ struct drbd_conf {
struct lru_cache *act_log; /* activity log */
unsigned int al_tr_number;
int al_tr_cycle;
- int al_tr_pos; /* position of the next transaction in the journal */
wait_queue_head_t seq_wait;
atomic_t packet_seq;
unsigned int peer_seq;
@@ -1032,6 +1042,10 @@ struct drbd_conf {
atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
unsigned int peer_max_bio_size;
unsigned int local_max_bio_size;
+
+ /* any requests that would block in drbd_make_request()
+ * are deferred to this single-threaded work queue */
+ struct submit_worker submit;
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1148,25 +1162,44 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
char *why, enum bm_flag flags);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
-extern void drbd_go_diskless(struct drbd_conf *mdev);
extern void drbd_ldev_destroy(struct drbd_conf *mdev);
/* Meta data layout
- We reserve a 128MB Block (4k aligned)
- * either at the end of the backing device
- * or on a separate meta data device. */
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * Variants:
+ * old, indexed fixed size meta data:
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ * end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ * The activity log consists of 4k transaction blocks,
+ * which are written in a ring-buffer, or striped ring-buffer like fashion,
+ * which are writtensize used to be fixed 32kB,
+ * but is about to become configurable.
+ */
-/* The following numbers are sectors */
-/* Allows up to about 3.8TB, so if you want more,
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
* you need to use the "flexible" meta data format. */
-#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
-#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
-#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */
-#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
-
-/* we do all meta data IO in 4k blocks */
-#define MD_BLOCK_SHIFT 12
-#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
+#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
+#define MD_4kB_SECT 8
+#define MD_32kB_SECT 64
/* One activity log extent represents 4M of storage */
#define AL_EXTENT_SHIFT 22
@@ -1256,7 +1289,6 @@ struct bm_extent {
/* in one sector of the bitmap, we have this many activity_log extents. */
#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
-#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
@@ -1276,16 +1308,18 @@ struct bm_extent {
*/
#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
-#define DRBD_MAX_SECTORS_BM \
- ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
-#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
-#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
-#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
-#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
+/* we have a certain meta data variant that has a fixed on-disk size of 128
+ * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
+ * log, leaving this many sectors for the bitmap.
+ */
+
+#define DRBD_MAX_SECTORS_FIXED_BM \
+ ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
+#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
#else
-#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
/* 16 TB in units of sectors */
#if BITS_PER_LONG == 32
/* adjust by one page worth of bitmap,
@@ -1418,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn);
extern int proc_details;
/* drbd_req */
+extern void do_submit(struct work_struct *ws);
extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
extern void drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
@@ -1576,7 +1611,10 @@ extern const char *drbd_conn_str(enum drbd_conns s);
extern const char *drbd_role_str(enum drbd_role s);
/* drbd_actlog.c */
-extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i);
+extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i);
+extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate);
+extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i);
+extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate);
extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
@@ -1755,9 +1793,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
* BTW, for internal meta data, this happens to be the maximum capacity
* we could agree upon with our peer node.
*/
-static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev)
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
{
- switch (meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + bdev->md.bm_offset;
@@ -1767,36 +1805,19 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi
}
}
-static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
-{
- int meta_dev_idx;
-
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
- rcu_read_unlock();
-
- return _drbd_md_first_sector(meta_dev_idx, bdev);
-}
-
/**
* drbd_md_last_sector() - Return the last sector number of the meta data area
* @bdev: Meta data block device.
*/
static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
{
- int meta_dev_idx;
-
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
- rcu_read_unlock();
-
- switch (meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
- return bdev->md.md_offset + MD_AL_OFFSET - 1;
+ return bdev->md.md_offset + MD_4kB_SECT -1;
case DRBD_MD_INDEX_FLEX_EXT:
default:
- return bdev->md.md_offset + bdev->md.md_size_sect;
+ return bdev->md.md_offset + bdev->md.md_size_sect -1;
}
}
@@ -1818,18 +1839,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev)
static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
{
sector_t s;
- int meta_dev_idx;
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
- rcu_read_unlock();
-
- switch (meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
s = drbd_get_capacity(bdev->backing_bdev)
? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
- _drbd_md_first_sector(meta_dev_idx, bdev))
+ drbd_md_first_sector(bdev))
: 0;
break;
case DRBD_MD_INDEX_FLEX_EXT:
@@ -1848,39 +1864,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
}
/**
- * drbd_md_ss__() - Return the sector number of our meta data super block
- * @mdev: DRBD device.
+ * drbd_md_ss() - Return the sector number of our meta data super block
* @bdev: Meta data block device.
*/
-static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev)
+static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
{
- int meta_dev_idx;
+ const int meta_dev_idx = bdev->md.meta_dev_idx;
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
- rcu_read_unlock();
-
- switch (meta_dev_idx) {
- default: /* external, some index */
- return MD_RESERVED_SECT * meta_dev_idx;
- case DRBD_MD_INDEX_INTERNAL:
- /* with drbd08, internal meta data is always "flexible" */
- case DRBD_MD_INDEX_FLEX_INT:
- /* sizeof(struct md_on_disk_07) == 4k
- * position: last 4k aligned block of 4k size */
- if (!bdev->backing_bdev) {
- if (__ratelimit(&drbd_ratelimit_state)) {
- dev_err(DEV, "bdev->backing_bdev==NULL\n");
- dump_stack();
- }
- return 0;
- }
- return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
- - MD_AL_OFFSET;
- case DRBD_MD_INDEX_FLEX_EXT:
+ if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
return 0;
- }
+
+ /* Since drbd08, internal meta data is always "flexible".
+ * position: last 4k aligned block of 4k size */
+ if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
+ return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
+
+ /* external, some index; this is the old fixed size layout */
+ return MD_128MB_SECT * bdev->md.meta_dev_idx;
}
static inline void
@@ -2053,9 +2054,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
if (mdev->state.disk == D_DISKLESS)
/* even internal references gone, safe to destroy */
drbd_ldev_destroy(mdev);
- if (mdev->state.disk == D_FAILED)
+ if (mdev->state.disk == D_FAILED) {
/* all application IO references gone. */
- drbd_go_diskless(mdev);
+ if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
+ }
wake_up(&mdev->misc_wait);
}
}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 298b868910dc..a5dca6affcbb 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -45,7 +45,7 @@
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
-
+#include <linux/workqueue.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>
@@ -2299,6 +2299,7 @@ static void drbd_cleanup(void)
idr_for_each_entry(&minors, mdev, i) {
idr_remove(&minors, mdev_to_minor(mdev));
idr_remove(&mdev->tconn->volumes, mdev->vnr);
+ destroy_workqueue(mdev->submit.wq);
del_gendisk(mdev->vdisk);
/* synchronize_rcu(); No other threads running at this point */
kref_put(&mdev->kref, &drbd_minor_destroy);
@@ -2588,6 +2589,21 @@ void conn_destroy(struct kref *kref)
kfree(tconn);
}
+int init_submitter(struct drbd_conf *mdev)
+{
+ /* opencoded create_singlethread_workqueue(),
+ * to be able to say "drbd%d", ..., minor */
+ mdev->submit.wq = alloc_workqueue("drbd%u_submit",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor);
+ if (!mdev->submit.wq)
+ return -ENOMEM;
+
+ INIT_WORK(&mdev->submit.worker, do_submit);
+ spin_lock_init(&mdev->submit.lock);
+ INIT_LIST_HEAD(&mdev->submit.writes);
+ return 0;
+}
+
enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
{
struct drbd_conf *mdev;
@@ -2677,6 +2693,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
goto out_idr_remove_minor;
}
+ if (init_submitter(mdev)) {
+ err = ERR_NOMEM;
+ drbd_msg_put_info("unable to create submit workqueue");
+ goto out_idr_remove_vol;
+ }
+
add_disk(disk);
kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */
@@ -2687,6 +2709,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
return NO_ERROR;
+out_idr_remove_vol:
+ idr_remove(&tconn->volumes, vnr_got);
out_idr_remove_minor:
idr_remove(&minors, minor_got);
synchronize_rcu();
@@ -2794,6 +2818,7 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ kfree(ldev->disk_conf);
kfree(ldev);
}
@@ -2833,8 +2858,9 @@ void conn_md_sync(struct drbd_tconn *tconn)
rcu_read_unlock();
}
+/* aligned 4kByte */
struct meta_data_on_disk {
- u64 la_size; /* last agreed size. */
+ u64 la_size_sect; /* last agreed size. */
u64 uuid[UI_SIZE]; /* UUIDs. */
u64 device_uuid;
u64 reserved_u64_1;
@@ -2842,13 +2868,17 @@ struct meta_data_on_disk {
u32 magic;
u32 md_size_sect;
u32 al_offset; /* offset to this block */
- u32 al_nr_extents; /* important for restoring the AL */
+ u32 al_nr_extents; /* important for restoring the AL (userspace) */
/* `-- act_log->nr_elements <-- ldev->dc.al_extents */
u32 bm_offset; /* offset to the bitmap, from here */
u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
u32 la_peer_max_bio_size; /* last peer max_bio_size */
- u32 reserved_u32[3];
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+
+ u8 reserved_u8[4096 - (7*8 + 10*4)];
} __packed;
/**
@@ -2861,6 +2891,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
sector_t sector;
int i;
+ /* Don't accidentally change the DRBD meta data layout. */
+ BUILD_BUG_ON(UI_SIZE != 4);
+ BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
del_timer(&mdev->md_sync_timer);
/* timer may be rearmed by drbd_md_mark_dirty() now. */
if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
@@ -2875,9 +2909,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
if (!buffer)
goto out;
- memset(buffer, 0, 512);
+ memset(buffer, 0, sizeof(*buffer));
- buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
+ buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
for (i = UI_CURRENT; i < UI_SIZE; i++)
buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
@@ -2892,7 +2926,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
- D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
+ buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes);
+ buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k);
+
+ D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset);
sector = mdev->ldev->md.md_offset;
if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
@@ -2910,13 +2947,141 @@ out:
put_ldev(mdev);
}
+static int check_activity_log_stripe_size(struct drbd_conf *mdev,
+ struct meta_data_on_disk *on_disk,
+ struct drbd_md *in_core)
+{
+ u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
+ u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
+ u64 al_size_4k;
+
+ /* both not set: default to old fixed size activity log */
+ if (al_stripes == 0 && al_stripe_size_4k == 0) {
+ al_stripes = 1;
+ al_stripe_size_4k = MD_32kB_SECT/8;
+ }
+
+ /* some paranoia plausibility checks */
+
+ /* we need both values to be set */
+ if (al_stripes == 0 || al_stripe_size_4k == 0)
+ goto err;
+
+ al_size_4k = (u64)al_stripes * al_stripe_size_4k;
+
+ /* Upper limit of activity log area, to avoid potential overflow
+ * problems in al_tr_number_to_on_disk_sector(). As right now, more
+ * than 72 * 4k blocks total only increases the amount of history,
+ * limiting this arbitrarily to 16 GB is not a real limitation ;-) */
+ if (al_size_4k > (16 * 1024 * 1024/4))
+ goto err;
+
+ /* Lower limit: we need at least 8 transaction slots (32kB)
+ * to not break existing setups */
+ if (al_size_4k < MD_32kB_SECT/8)
+ goto err;
+
+ in_core->al_stripe_size_4k = al_stripe_size_4k;
+ in_core->al_stripes = al_stripes;
+ in_core->al_size_4k = al_size_4k;
+
+ return 0;
+err:
+ dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
+ al_stripes, al_stripe_size_4k);
+ return -EINVAL;
+}
+
+static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+ sector_t capacity = drbd_get_capacity(bdev->md_bdev);
+ struct drbd_md *in_core = &bdev->md;
+ s32 on_disk_al_sect;
+ s32 on_disk_bm_sect;
+
+ /* The on-disk size of the activity log, calculated from offsets, and
+ * the size of the activity log calculated from the stripe settings,
+ * should match.
+ * Though we could relax this a bit: it is ok, if the striped activity log
+ * fits in the available on-disk activity log size.
+ * Right now, that would break how resize is implemented.
+ * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
+ * of possible unused padding space in the on disk layout. */
+ if (in_core->al_offset < 0) {
+ if (in_core->bm_offset > in_core->al_offset)
+ goto err;
+ on_disk_al_sect = -in_core->al_offset;
+ on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
+ } else {
+ if (in_core->al_offset != MD_4kB_SECT)
+ goto err;
+ if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
+ on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
+ }
+
+ /* old fixed size meta data is exactly that: fixed. */
+ if (in_core->meta_dev_idx >= 0) {
+ if (in_core->md_size_sect != MD_128MB_SECT
+ || in_core->al_offset != MD_4kB_SECT
+ || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
+ || in_core->al_stripes != 1
+ || in_core->al_stripe_size_4k != MD_32kB_SECT/8)
+ goto err;
+ }
+
+ if (capacity < in_core->md_size_sect)
+ goto err;
+ if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
+ goto err;
+
+ /* should be aligned, and at least 32k */
+ if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
+ goto err;
+
+ /* should fit (for now: exactly) into the available on-disk space;
+ * overflow prevention is in check_activity_log_stripe_size() above. */
+ if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ /* again, should be aligned */
+ if (in_core->bm_offset & 7)
+ goto err;
+
+ /* FIXME check for device grow with flex external meta data? */
+
+ /* can the available bitmap space cover the last agreed device size? */
+ if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
+ goto err;
+
+ return 0;
+
+err:
+ dev_err(DEV, "meta data offsets don't make sense: idx=%d "
+ "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
+ "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
+ in_core->meta_dev_idx,
+ in_core->al_stripes, in_core->al_stripe_size_4k,
+ in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
+ (unsigned long long)in_core->la_size_sect,
+ (unsigned long long)capacity);
+
+ return -EINVAL;
+}
+
+
/**
* drbd_md_read() - Reads in the meta data super block
* @mdev: DRBD device.
* @bdev: Device from which the meta data should be read in.
*
- * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
+ * Return NO_ERROR on success, and an enum drbd_ret_code in case
* something goes wrong.
+ *
+ * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
+ * even before @bdev is assigned to @mdev->ldev.
*/
int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
@@ -2924,12 +3089,17 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
u32 magic, flags;
int i, rv = NO_ERROR;
- if (!get_ldev_if_state(mdev, D_ATTACHING))
- return ERR_IO_MD_DISK;
+ if (mdev->state.disk != D_DISKLESS)
+ return ERR_DISK_CONFIGURED;
buffer = drbd_md_get_buffer(mdev);
if (!buffer)
- goto out;
+ return ERR_NOMEM;
+
+ /* First, figure out where our meta data superblock is located,
+ * and read it. */
+ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
+ bdev->md.md_offset = drbd_md_ss(bdev);
if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
/* NOTE: can't do normal error processing here as this is
@@ -2948,45 +3118,51 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
rv = ERR_MD_UNCLEAN;
goto err;
}
+
+ rv = ERR_MD_INVALID;
if (magic != DRBD_MD_MAGIC_08) {
if (magic == DRBD_MD_MAGIC_07)
dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
else
dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
- rv = ERR_MD_INVALID;
goto err;
}
- if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
- dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
- be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
- rv = ERR_MD_INVALID;
+
+ if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+ dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+ be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
goto err;
}
+
+
+ /* convert to in_core endian */
+ bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+ bdev->md.flags = be32_to_cpu(buffer->flags);
+ bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+ bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
+ bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
+ bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
+
+ if (check_activity_log_stripe_size(mdev, buffer, &bdev->md))
+ goto err;
+ if (check_offsets_and_sizes(mdev, bdev))
+ goto err;
+
if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
- rv = ERR_MD_INVALID;
goto err;
}
if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
- rv = ERR_MD_INVALID;
goto err;
}
- if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
- dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
- be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
- rv = ERR_MD_INVALID;
- goto err;
- }
-
- bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
- for (i = UI_CURRENT; i < UI_SIZE; i++)
- bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
- bdev->md.flags = be32_to_cpu(buffer->flags);
- bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+ rv = NO_ERROR;
spin_lock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn < C_CONNECTED) {
@@ -2999,8 +3175,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
err:
drbd_md_put_buffer(mdev);
- out:
- put_ldev(mdev);
return rv;
}
@@ -3238,8 +3412,12 @@ static int w_go_diskless(struct drbd_work *w, int unused)
* end up here after a failed attach, before ldev was even assigned.
*/
if (mdev->bitmap && mdev->ldev) {
+ /* An interrupted resync or similar is allowed to recounts bits
+ * while we detach.
+ * Any modifications would not be expected anymore, though.
+ */
if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
- "detach", BM_LOCKED_MASK)) {
+ "detach", BM_LOCKED_TEST_ALLOWED)) {
if (test_bit(WAS_READ_ERROR, &mdev->flags)) {
drbd_md_set_flag(mdev, MDF_FULL_SYNC);
drbd_md_sync(mdev);
@@ -3251,13 +3429,6 @@ static int w_go_diskless(struct drbd_work *w, int unused)
return 0;
}
-void drbd_go_diskless(struct drbd_conf *mdev)
-{
- D_ASSERT(mdev->state.disk == D_FAILED);
- if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
- drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
-}
-
/**
* drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
* @mdev: DRBD device.
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 2af26fc95280..9e3f441e7e84 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -696,37 +696,52 @@ out:
return 0;
}
-/* initializes the md.*_offset members, so we are able to find
- * the on disk meta data */
+/* Initializes the md.*_offset members, so we are able to find
+ * the on disk meta data.
+ *
+ * We currently have two possible layouts:
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * Activity log size used to be fixed 32kB,
+ * but is about to become configurable.
+ */
static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev)
{
sector_t md_size_sect = 0;
- int meta_dev_idx;
+ unsigned int al_size_sect = bdev->md.al_size_4k * 8;
- rcu_read_lock();
- meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ bdev->md.md_offset = drbd_md_ss(bdev);
- switch (meta_dev_idx) {
+ switch (bdev->md.meta_dev_idx) {
default:
/* v07 style fixed size indexed meta data */
- bdev->md.md_size_sect = MD_RESERVED_SECT;
- bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
- bdev->md.al_offset = MD_AL_OFFSET;
- bdev->md.bm_offset = MD_BM_OFFSET;
+ bdev->md.md_size_sect = MD_128MB_SECT;
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
break;
case DRBD_MD_INDEX_FLEX_EXT:
/* just occupy the full device; unit: sectors */
bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
- bdev->md.md_offset = 0;
- bdev->md.al_offset = MD_AL_OFFSET;
- bdev->md.bm_offset = MD_BM_OFFSET;
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
break;
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
- bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
/* al size is still fixed */
- bdev->md.al_offset = -MD_AL_SECTORS;
+ bdev->md.al_offset = -al_size_sect;
/* we need (slightly less than) ~ this much bitmap sectors: */
md_size_sect = drbd_get_capacity(bdev->backing_bdev);
md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
@@ -735,14 +750,13 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
/* plus the "drbd meta data super block",
* and the activity log; */
- md_size_sect += MD_BM_OFFSET;
+ md_size_sect += MD_4kB_SECT + al_size_sect;
bdev->md.md_size_sect = md_size_sect;
/* bitmap offset is adjusted by 'super' block size */
- bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
+ bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
break;
}
- rcu_read_unlock();
}
/* input size is expected to be in KB */
@@ -805,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
{
sector_t prev_first_sect, prev_size; /* previous meta location */
- sector_t la_size, u_size;
+ sector_t la_size_sect, u_size;
sector_t size;
char ppb[10];
@@ -828,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
prev_first_sect = drbd_md_first_sector(mdev->ldev);
prev_size = mdev->ldev->md.md_size_sect;
- la_size = mdev->ldev->md.la_size_sect;
+ la_size_sect = mdev->ldev->md.la_size_sect;
/* TODO: should only be some assert here, not (re)init... */
drbd_md_set_sector_offsets(mdev, mdev->ldev);
@@ -864,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
if (rv == dev_size_error)
goto out;
- la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
+ la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
|| prev_size != mdev->ldev->md.md_size_sect;
@@ -886,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
drbd_md_mark_dirty(mdev);
}
- if (size > la_size)
+ if (size > la_size_sect)
rv = grew;
- if (size < la_size)
+ if (size < la_size_sect)
rv = shrunk;
out:
lc_unlock(mdev->act_log);
@@ -903,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
sector_t u_size, int assume_peer_has_space)
{
sector_t p_size = mdev->p_size; /* partner's disk size. */
- sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
+ sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
sector_t m_size; /* my size */
sector_t size = 0;
@@ -917,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
if (p_size && m_size) {
size = min_t(sector_t, p_size, m_size);
} else {
- if (la_size) {
- size = la_size;
+ if (la_size_sect) {
+ size = la_size_sect;
if (m_size && m_size < size)
size = m_size;
if (p_size && p_size < size)
@@ -1127,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info)
return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
}
-static void enforce_disk_conf_limits(struct disk_conf *dc)
+static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
{
- if (dc->al_extents < DRBD_AL_EXTENTS_MIN)
- dc->al_extents = DRBD_AL_EXTENTS_MIN;
- if (dc->al_extents > DRBD_AL_EXTENTS_MAX)
- dc->al_extents = DRBD_AL_EXTENTS_MAX;
+ /* This is limited by 16 bit "slot" numbers,
+ * and by available on-disk context storage.
+ *
+ * Also (u16)~0 is special (denotes a "free" extent).
+ *
+ * One transaction occupies one 4kB on-disk block,
+ * we have n such blocks in the on disk ring buffer,
+ * the "current" transaction may fail (n-1),
+ * and there is 919 slot numbers context information per transaction.
+ *
+ * 72 transaction blocks amounts to more than 2**16 context slots,
+ * so cap there first.
+ */
+ const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
+ const unsigned int sufficient_on_disk =
+ (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
+ /AL_CONTEXT_PER_TRANSACTION;
+
+ unsigned int al_size_4k = bdev->md.al_size_4k;
+
+ if (al_size_4k > sufficient_on_disk)
+ return max_al_nr;
- if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
- dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+ return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
}
int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
@@ -1182,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
if (!expect(new_disk_conf->resync_rate >= 1))
new_disk_conf->resync_rate = 1;
- enforce_disk_conf_limits(new_disk_conf);
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev))
+ new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev);
+
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
if (fifo_size != mdev->rs_plan_s->size) {
@@ -1330,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
goto fail;
}
- enforce_disk_conf_limits(new_disk_conf);
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
if (!new_plan) {
@@ -1343,6 +1381,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
goto fail;
}
+ write_lock_irq(&global_state_lock);
+ retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
+ write_unlock_irq(&global_state_lock);
+ if (retcode != NO_ERROR)
+ goto fail;
+
rcu_read_lock();
nc = rcu_dereference(mdev->tconn->net_conf);
if (nc) {
@@ -1399,8 +1443,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
goto fail;
}
- /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
- drbd_md_set_sector_offsets(mdev, nbc);
+ /* Read our meta data super block early.
+ * This also sets other on-disk offsets. */
+ retcode = drbd_md_read(mdev, nbc);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
+ new_disk_conf->al_extents = drbd_al_extents_max(nbc);
if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
@@ -1416,7 +1468,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
min_md_device_sectors = (2<<10);
} else {
max_possible_sectors = DRBD_MAX_SECTORS;
- min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1);
+ min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
}
if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
@@ -1467,8 +1519,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
if (!get_ldev_if_state(mdev, D_ATTACHING))
goto force_diskless;
- drbd_md_set_sector_offsets(mdev, nbc);
-
if (!mdev->bitmap) {
if (drbd_bm_init(mdev)) {
retcode = ERR_NOMEM;
@@ -1476,10 +1526,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
}
}
- retcode = drbd_md_read(mdev, nbc);
- if (retcode != NO_ERROR)
- goto force_diskless_dec;
-
if (mdev->state.conn < C_CONNECTED &&
mdev->state.role == R_PRIMARY &&
(mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
@@ -2158,8 +2204,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for
return SS_SUCCESS;
case SS_PRIMARY_NOP:
/* Our state checking code wants to see the peer outdated. */
- rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
- pdsk, D_OUTDATED), CS_VERBOSE);
+ rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
+
+ if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
+ rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE);
+
break;
case SS_CW_FAILED_BY_PEER:
/* The peer probably wants to see us outdated. */
@@ -2406,22 +2455,19 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
drbd_flush_workqueue(mdev);
- retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
-
- if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
- retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
-
- while (retcode == SS_NEED_CONNECTION) {
- spin_lock_irq(&mdev->tconn->req_lock);
- if (mdev->state.conn < C_CONNECTED)
- retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->tconn->req_lock);
-
- if (retcode != SS_NEED_CONNECTION)
- break;
-
+ /* If we happen to be C_STANDALONE R_SECONDARY, just change to
+ * D_INCONSISTENT, and set all bits in the bitmap. Otherwise,
+ * try to start a resync handshake as sync target for full sync.
+ */
+ if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) {
+ retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
+ "set_n_write from invalidate", BM_LOCKED_MASK))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
- }
drbd_resume_io(mdev);
out:
@@ -2475,21 +2521,22 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
drbd_flush_workqueue(mdev);
- retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
- if (retcode < SS_SUCCESS) {
- if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
- /* The peer will get a resync upon connect anyways.
- * Just make that into a full resync. */
- retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
- if (retcode >= SS_SUCCESS) {
- if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
- "set_n_write from invalidate_peer",
- BM_LOCKED_SET_ALLOWED))
- retcode = ERR_IO_MD_DISK;
- }
- } else
- retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
- }
+ /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
+ * in the bitmap. Otherwise, try to start a resync handshake
+ * as sync source for full sync.
+ */
+ if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) {
+ /* The peer will get a resync upon connect anyways. Just make that
+ into a full resync. */
+ retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
+ "set_n_write from invalidate_peer",
+ BM_LOCKED_SET_ALLOWED))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
+ retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
drbd_resume_io(mdev);
out:
@@ -3162,6 +3209,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
CS_VERBOSE + CS_WAIT_COMPLETE);
idr_remove(&mdev->tconn->volumes, mdev->vnr);
idr_remove(&minors, mdev_to_minor(mdev));
+ destroy_workqueue(mdev->submit.wq);
del_gendisk(mdev->vdisk);
synchronize_rcu();
kref_put(&mdev->kref, &drbd_minor_destroy);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 928adb815b09..bf31d41dbaad 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -313,8 +313,14 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
static int drbd_proc_open(struct inode *inode, struct file *file)
{
- if (try_module_get(THIS_MODULE))
- return single_open(file, drbd_seq_show, PDE_DATA(inode));
+ int err;
+
+ if (try_module_get(THIS_MODULE)) {
+ err = single_open(file, drbd_seq_show, PDE_DATA(inode));
+ if (err)
+ module_put(THIS_MODULE);
+ return err;
+ }
return -ENODEV;
}
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 83c5ae0ed56b..4222affff488 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -850,6 +850,7 @@ int drbd_connected(struct drbd_conf *mdev)
err = drbd_send_current_state(mdev);
clear_bit(USE_DEGR_WFC_T, &mdev->flags);
clear_bit(RESIZE_PENDING, &mdev->flags);
+ atomic_set(&mdev->ap_in_flight, 0);
mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
return err;
}
@@ -2266,7 +2267,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
- drbd_al_begin_io(mdev, &peer_req->i);
+ drbd_al_begin_io(mdev, &peer_req->i, true);
}
err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2662,7 +2663,6 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
if (hg == -1 && mdev->state.role == R_PRIMARY) {
enum drbd_state_rv rv2;
- drbd_set_role(mdev, R_SECONDARY, 0);
/* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
* we might be here in C_WF_REPORT_PARAMS which is transient.
* we do not need to wait for the after state change work either. */
@@ -3993,7 +3993,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
clear_bit(DISCARD_MY_DATA, &mdev->flags);
- drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
+ drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */
return 0;
}
@@ -4660,8 +4660,8 @@ static int drbd_do_features(struct drbd_tconn *tconn)
#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
static int drbd_do_auth(struct drbd_tconn *tconn)
{
- dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
- dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+ conn_err(tconn, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
+ conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
return -1;
}
#else
@@ -5258,9 +5258,11 @@ int drbd_asender(struct drbd_thread *thi)
bool ping_timeout_active = false;
struct net_conf *nc;
int ping_timeo, tcp_cork, ping_int;
+ struct sched_param param = { .sched_priority = 2 };
- current->policy = SCHED_RR; /* Make this a realtime task! */
- current->rt_priority = 2; /* more important than all other tasks */
+ rv = sched_setscheduler(current, SCHED_RR, &param);
+ if (rv < 0)
+ conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv);
while (get_t_state(thi) == RUNNING) {
drbd_thread_current_set_cpu(thi);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 2b8303ad63c9..c24379ffd4e3 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -34,14 +34,14 @@
static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);
/* Update disk stats at start of I/O request */
-static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
+static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
{
- const int rw = bio_data_dir(bio);
+ const int rw = bio_data_dir(req->master_bio);
int cpu;
cpu = part_stat_lock();
part_round_stats(cpu, &mdev->vdisk->part0);
part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
- part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+ part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9);
(void) cpu; /* The macro invocations above want the cpu argument, I do not like
the compiler warning about cpu only assigned but never used... */
part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
else
root = &mdev->read_requests;
drbd_remove_request_interval(root, req);
- } else if (!(s & RQ_POSTPONED))
- D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
+ }
/* Before we can signal completion to the upper layers,
* we may need to close the current transfer log epoch.
@@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
D_ASSERT(req->rq_state & RQ_NET_PENDING);
mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
break;
+
+ case QUEUE_AS_DRBD_BARRIER:
+ start_new_tl_epoch(mdev->tconn);
+ mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
+ break;
};
return rv;
@@ -861,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev)
bool congested = false;
enum drbd_on_congestion on_congestion;
+ rcu_read_lock();
nc = rcu_dereference(tconn->net_conf);
on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+ rcu_read_unlock();
if (on_congestion == OC_BLOCK ||
tconn->agreed_pro_version < 96)
return;
@@ -956,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req)
struct drbd_conf *mdev = req->w.mdev;
int remote, send_oos;
- rcu_read_lock();
remote = drbd_should_do_remote(mdev->state);
- if (remote) {
- maybe_pull_ahead(mdev);
- remote = drbd_should_do_remote(mdev->state);
- }
send_oos = drbd_should_send_out_of_sync(mdev->state);
- rcu_read_unlock();
/* Need to replicate writes. Unless it is an empty flush,
* which is better mapped to a DRBD P_BARRIER packet,
@@ -975,8 +975,8 @@ static int drbd_process_write_request(struct drbd_request *req)
/* The only size==0 bios we expect are empty flushes. */
D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH);
if (remote)
- start_new_tl_epoch(mdev->tconn);
- return 0;
+ _req_mod(req, QUEUE_AS_DRBD_BARRIER);
+ return remote;
}
if (!remote && !send_oos)
@@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req)
bio_endio(bio, -EIO);
}
-void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req)
{
- const int rw = bio_rw(bio);
- struct bio_and_error m = { NULL, };
+ spin_lock(&mdev->submit.lock);
+ list_add_tail(&req->tl_requests, &mdev->submit.writes);
+ spin_unlock(&mdev->submit.lock);
+ queue_work(mdev->submit.wq, &mdev->submit.worker);
+}
+
+/* returns the new drbd_request pointer, if the caller is expected to
+ * drbd_send_and_submit() it (to save latency), or NULL if we queued the
+ * request on the submitter thread.
+ * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
+ */
+struct drbd_request *
+drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+{
+ const int rw = bio_data_dir(bio);
struct drbd_request *req;
- bool no_remote = false;
/* allocate outside of all locks; */
req = drbd_req_new(mdev, bio);
@@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
* if user cannot handle io errors, that's not our business. */
dev_err(DEV, "could not kmalloc() req\n");
bio_endio(bio, -ENOMEM);
- return;
+ return ERR_PTR(-ENOMEM);
}
req->start_time = start_time;
@@ -1044,28 +1056,40 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
req->private_bio = NULL;
}
- /* For WRITES going to the local disk, grab a reference on the target
- * extent. This waits for any resync activity in the corresponding
- * resync extent to finish, and, if necessary, pulls in the target
- * extent into the activity log, which involves further disk io because
- * of transactional on-disk meta data updates.
- * Empty flushes don't need to go into the activity log, they can only
- * flush data for pending writes which are already in there. */
+ /* Update disk stats */
+ _drbd_start_io_acct(mdev, req);
+
if (rw == WRITE && req->private_bio && req->i.size
&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
+ if (!drbd_al_begin_io_fastpath(mdev, &req->i)) {
+ drbd_queue_write(mdev, req);
+ return NULL;
+ }
req->rq_state |= RQ_IN_ACT_LOG;
- drbd_al_begin_io(mdev, &req->i);
}
+ return req;
+}
+
+static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req)
+{
+ const int rw = bio_rw(req->master_bio);
+ struct bio_and_error m = { NULL, };
+ bool no_remote = false;
+
spin_lock_irq(&mdev->tconn->req_lock);
if (rw == WRITE) {
/* This may temporarily give up the req_lock,
* but will re-aquire it before it returns here.
* Needs to be before the check on drbd_suspended() */
complete_conflicting_writes(req);
+ /* no more giving up req_lock from now on! */
+
+ /* check for congestion, and potentially stop sending
+ * full data updates, but start sending "dirty bits" only. */
+ maybe_pull_ahead(mdev);
}
- /* no more giving up req_lock from now on! */
if (drbd_suspended(mdev)) {
/* push back and retry: */
@@ -1078,9 +1102,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
goto out;
}
- /* Update disk stats */
- _drbd_start_io_acct(mdev, req, bio);
-
/* We fail READ/READA early, if we can not serve it.
* We must do this before req is registered on any lists.
* Otherwise, drbd_req_complete() will queue failed READ for retry. */
@@ -1137,7 +1158,116 @@ out:
if (m.bio)
complete_master_bio(mdev, &m);
- return;
+}
+
+void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+{
+ struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time);
+ if (IS_ERR_OR_NULL(req))
+ return;
+ drbd_send_and_submit(mdev, req);
+}
+
+static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming)
+{
+ struct drbd_request *req, *tmp;
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ const int rw = bio_data_dir(req->master_bio);
+
+ if (rw == WRITE /* rw != WRITE should not even end up here! */
+ && req->private_bio && req->i.size
+ && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+ if (!drbd_al_begin_io_fastpath(mdev, &req->i))
+ continue;
+
+ req->rq_state |= RQ_IN_ACT_LOG;
+ }
+
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(mdev, req);
+ }
+}
+
+static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev,
+ struct list_head *incoming,
+ struct list_head *pending)
+{
+ struct drbd_request *req, *tmp;
+ int wake = 0;
+ int err;
+
+ spin_lock_irq(&mdev->al_lock);
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ err = drbd_al_begin_io_nonblock(mdev, &req->i);
+ if (err == -EBUSY)
+ wake = 1;
+ if (err)
+ continue;
+ req->rq_state |= RQ_IN_ACT_LOG;
+ list_move_tail(&req->tl_requests, pending);
+ }
+ spin_unlock_irq(&mdev->al_lock);
+ if (wake)
+ wake_up(&mdev->al_wait);
+
+ return !list_empty(pending);
+}
+
+void do_submit(struct work_struct *ws)
+{
+ struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker);
+ LIST_HEAD(incoming);
+ LIST_HEAD(pending);
+ struct drbd_request *req, *tmp;
+
+ for (;;) {
+ spin_lock(&mdev->submit.lock);
+ list_splice_tail_init(&mdev->submit.writes, &incoming);
+ spin_unlock(&mdev->submit.lock);
+
+ submit_fast_path(mdev, &incoming);
+ if (list_empty(&incoming))
+ break;
+
+ wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending));
+ /* Maybe more was queued, while we prepared the transaction?
+ * Try to stuff them into this transaction as well.
+ * Be strictly non-blocking here, no wait_event, we already
+ * have something to commit.
+ * Stop if we don't make any more progres.
+ */
+ for (;;) {
+ LIST_HEAD(more_pending);
+ LIST_HEAD(more_incoming);
+ bool made_progress;
+
+ /* It is ok to look outside the lock,
+ * it's only an optimization anyways */
+ if (list_empty(&mdev->submit.writes))
+ break;
+
+ spin_lock(&mdev->submit.lock);
+ list_splice_tail_init(&mdev->submit.writes, &more_incoming);
+ spin_unlock(&mdev->submit.lock);
+
+ if (list_empty(&more_incoming))
+ break;
+
+ made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending);
+
+ list_splice_tail_init(&more_pending, &pending);
+ list_splice_tail_init(&more_incoming, &incoming);
+
+ if (!made_progress)
+ break;
+ }
+ drbd_al_begin_io_commit(mdev, false);
+
+ list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(mdev, req);
+ }
+ }
}
void drbd_make_request(struct request_queue *q, struct bio *bio)
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index c08d22964d06..978cb1addc98 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -88,6 +88,14 @@ enum drbd_req_event {
QUEUE_FOR_NET_READ,
QUEUE_FOR_SEND_OOS,
+ /* An empty flush is queued as P_BARRIER,
+ * which will cause it to complete "successfully",
+ * even if the local disk flush failed.
+ *
+ * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
+ * only see an error if neither local nor remote data is reachable. */
+ QUEUE_AS_DRBD_BARRIER,
+
SEND_CANCELED,
SEND_FAILED,
HANDED_OVER_TO_NETWORK,
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 0fe220cfb9e9..90c5be2b1d30 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -570,6 +570,13 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
mdev->tconn->agreed_pro_version < 88)
rv = SS_NOT_SUPPORTED;
+ else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+ ns.pdsk == D_UNKNOWN)
+ rv = SS_NEED_CONNECTION;
+
else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
rv = SS_CONNECTED_OUTDATES;
@@ -635,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t
&& os.conn < C_WF_REPORT_PARAMS)
rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+ if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
+ os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
+ rv = SS_OUTDATE_WO_CONN;
+
return rv;
}
@@ -1377,13 +1388,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
&drbd_bmio_set_n_write, &abw_start_sync,
"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
- /* We are invalidating our self... */
- if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
- os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
- /* other bitmap operation expected during this phase */
- drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
- "set_n_write from invalidate", BM_LOCKED_MASK);
-
/* first half of local IO error, failure to attach,
* or administrative detach */
if (os.disk != D_FAILED && ns.disk == D_FAILED) {
@@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state
if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags))
return SS_CW_FAILED_BY_PEER;
- rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR;
-
- if (rv == SS_UNKNOWN_ERROR)
- rv = conn_is_valid_transition(tconn, mask, val, 0);
-
- if (rv == SS_SUCCESS)
- rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+ rv = conn_is_valid_transition(tconn, mask, val, 0);
+ if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS)
+ rv = SS_UNKNOWN_ERROR; /* continue waiting */
return rv;
}
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 9a664bd27404..58e08ff2b2ce 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = {
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+ [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
[-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
};
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 424dc7bdf9b7..891c0ecaa292 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error)
md_io->done = 1;
wake_up(&mdev->misc_wait);
bio_put(bio);
- put_ldev(mdev);
+ if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
+ put_ldev(mdev);
}
/* reads on behalf of the partner,
@@ -1410,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
struct drbd_conf *mdev = w->mdev;
if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
- drbd_al_begin_io(mdev, &req->i);
+ drbd_al_begin_io(mdev, &req->i, false);
drbd_req_make_private_bio(req, req->master_bio);
req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
@@ -1425,7 +1426,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
int resync_after;
while (1) {
- if (!odev->ldev)
+ if (!odev->ldev || odev->state.disk == D_DISKLESS)
return 1;
rcu_read_lock();
resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
@@ -1433,7 +1434,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
if (resync_after == -1)
return 1;
odev = minor_to_mdev(resync_after);
- if (!expect(odev))
+ if (!odev)
return 1;
if ((odev->state.conn >= C_SYNC_SOURCE &&
odev->state.conn <= C_PAUSED_SYNC_T) ||
@@ -1515,7 +1516,7 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
if (o_minor == -1)
return NO_ERROR;
- if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
+ if (o_minor < -1 || o_minor > MINORMASK)
return ERR_RESYNC_AFTER;
/* check for loops */
@@ -1524,6 +1525,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
if (odev == mdev)
return ERR_RESYNC_AFTER_CYCLE;
+ /* You are free to depend on diskless, non-existing,
+ * or not yet/no longer existing minors.
+ * We only reject dependency loops.
+ * We cannot follow the dependency chain beyond a detached or
+ * missing minor.
+ */
+ if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
+ return NO_ERROR;
+
rcu_read_lock();
resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
rcu_read_unlock();
@@ -1652,7 +1662,9 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
clear_bit(B_RS_H_DONE, &mdev->flags);
write_lock_irq(&global_state_lock);
- if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
+ /* Did some connection breakage or IO error race with us? */
+ if (mdev->state.conn < C_CONNECTED
+ || !get_ldev_if_state(mdev, D_NEGOTIATING)) {
write_unlock_irq(&global_state_lock);
mutex_unlock(mdev->state_mutex);
return;