diff options
Diffstat (limited to 'drivers/md')
81 files changed, 1697 insertions, 944 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index c58a9a8ea54e..a3fcdca7e6db 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -226,6 +226,7 @@ config BLK_DEV_DM select BLOCK_HOLDER_DEPRECATED if SYSFS select BLK_DEV_DM_BUILTIN select BLK_MQ_STACKING + select CRYPTO_LIB_SHA256 if IMA depends on DAX || DAX=n help Device-mapper is a low level volume manager. It works by allowing @@ -299,6 +300,7 @@ config DM_CRYPT select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV + select CRYPTO_LIB_AES select CRYPTO_LIB_MD5 # needed by lmk IV mode help This device-mapper target allows you to create a device that diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 64bb38c95895..97d9adb0bf96 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1373,6 +1373,14 @@ static CLOSURE_CALLBACK(cached_dev_free) mutex_unlock(&bch_register_lock); + /* + * Wait for any pending sb_write to complete before free. + * The sb_bio is embedded in struct cached_dev, so we must + * ensure no I/O is in progress. 
+ */ + down(&dc->sb_write_mutex); + up(&dc->sb_write_mutex); + if (dc->sb_disk) folio_put(virt_to_folio(dc->sb_disk)); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 60f7badec91f..26fedf5883ef 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -391,7 +391,7 @@ struct dm_buffer_cache { */ unsigned int num_locks; bool no_sleep; - struct buffer_tree trees[]; + struct buffer_tree trees[] __counted_by(num_locks); }; static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); @@ -2511,7 +2511,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign } num_locks = dm_num_hash_locks(); - c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL); + c = kzalloc_flex(*c, cache.trees, num_locks); if (!c) { r = -ENOMEM; goto bad_client; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 57158c02d096..acd9b179fcb3 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1023,6 +1023,12 @@ static bool cmd_write_lock(struct dm_cache_metadata *cmd) return; \ } while (0) +#define WRITE_LOCK_OR_GOTO(cmd, label) \ + do { \ + if (!cmd_write_lock((cmd))) \ + goto label; \ + } while (0) + #define WRITE_UNLOCK(cmd) \ up_write(&(cmd)->root_lock) @@ -1714,17 +1720,6 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * return r; } -int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) -{ - int r; - - READ_LOCK(cmd); - r = blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); - READ_UNLOCK(cmd); - - return r; -} - void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd) { WRITE_LOCK_VOID(cmd); @@ -1791,11 +1786,8 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); - WRITE_LOCK(cmd); - if (cmd->fail_io) { - WRITE_UNLOCK(cmd); - goto out; - } + /* cmd_write_lock() 
already checks fail_io with cmd->root_lock held */ + WRITE_LOCK_OR_GOTO(cmd, out); __destroy_persistent_data_objects(cmd, false); old_bm = cmd->bm; @@ -1824,3 +1816,12 @@ out: return r; } + +int dm_cache_metadata_clean_when_opened(struct dm_cache_metadata *cmd, bool *result) +{ + READ_LOCK(cmd); + *result = cmd->clean_when_opened; + READ_UNLOCK(cmd); + + return 0; +} diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 5f77890207fe..91f8706b41fd 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -135,17 +135,17 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, */ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p); -/* - * Query method. Are all the blocks in the cache clean? - */ -int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); - int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result); int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd); int dm_cache_metadata_abort(struct dm_cache_metadata *cmd); +/* + * Query method. Was the metadata cleanly shut down when opened? 
+ */ +int dm_cache_metadata_clean_when_opened(struct dm_cache_metadata *cmd, bool *result); + /*----------------------------------------------------------------*/ #endif /* DM_CACHE_METADATA_H */ diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index b328d9601046..dd77a93fd68d 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1589,14 +1589,18 @@ static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) { struct smq_policy *mq = to_smq_policy(p); struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + unsigned long flags; if (!e->allocated) return -ENODATA; + spin_lock_irqsave(&mq->lock, flags); // FIXME: what if this block has pending background work? del_queue(mq, e); h_remove(&mq->table, e); free_entry(&mq->cache_alloc, e); + spin_unlock_irqrestore(&mq->lock, flags); + return 0; } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 935ab79b1d0c..097315a9bf0f 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1462,11 +1462,19 @@ static void invalidate_complete(struct dm_cache_migration *mg, bool success) struct cache *cache = mg->cache; bio_list_init(&bios); - if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) - free_prison_cell(cache, mg->cell); + if (mg->cell) { + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + } - if (!success && mg->overwrite_bio) - bio_io_error(mg->overwrite_bio); + if (mg->overwrite_bio) { + // Set generic error if the bio hasn't been issued yet, + // e.g., invalidation or metadata commit failed before bio + // submission. Otherwise preserve the bio's own error status. 
+ if (!success && !mg->overwrite_bio->bi_status) + mg->overwrite_bio->bi_status = BLK_STS_IOERR; + bio_endio(mg->overwrite_bio); + } free_migration(mg); defer_bios(cache, &bios); @@ -1506,6 +1514,24 @@ static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) return r; } +static void invalidate_committed(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct bio *bio = mg->overwrite_bio; + struct per_bio_data *pb = get_per_bio_data(bio); + + if (mg->k.input) { + invalidate_complete(mg, false); + return; + } + + init_continuation(&mg->k, invalidate_completed); + remap_to_origin_clear_discard(cache, bio, mg->invalidate_oblock); + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); + dm_submit_bio_remap(bio, NULL); +} + static void invalidate_remove(struct work_struct *ws) { int r; @@ -1518,10 +1544,8 @@ static void invalidate_remove(struct work_struct *ws) return; } - init_continuation(&mg->k, invalidate_completed); + init_continuation(&mg->k, invalidate_committed); continue_after_commit(&cache->committer, &mg->k); - remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); - mg->overwrite_bio = NULL; schedule_commit(&cache->committer); } @@ -1539,6 +1563,15 @@ static int invalidate_lock(struct dm_cache_migration *mg) READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); if (r < 0) { free_prison_cell(cache, prealloc); + + /* Defer the bio for retrying the cell lock */ + if (mg->overwrite_bio) { + struct bio *bio = mg->overwrite_bio; + + mg->overwrite_bio = NULL; + defer_bio(cache, bio); + } + invalidate_complete(mg, false); return r; } @@ -1701,6 +1734,7 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, bio_drop_shared_lock(cache, bio); atomic_inc(&cache->stats.demotion); invalidate_start(cache, cblock, block, bio); + return DM_MAPIO_SUBMITTED; } else remap_to_origin_clear_discard(cache, bio, block); } else { @@ -2467,23 +2501,8 @@ static int 
cache_create(struct cache_args *ca, struct cache **result) goto bad; } - if (passthrough_mode(cache)) { - bool all_clean; - - r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); - if (r) { - *error = "dm_cache_metadata_all_clean() failed"; - goto bad; - } - - if (!all_clean) { - *error = "Cannot enter passthrough mode unless all blocks are clean"; - r = -EINVAL; - goto bad; - } - + if (passthrough_mode(cache)) policy_allow_migrations(cache->policy, false); - } spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); @@ -2810,6 +2829,12 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, struct cache *cache = context; if (dirty) { + if (passthrough_mode(cache)) { + DMERR("%s: cannot enter passthrough mode unless all blocks are clean", + cache_device_name(cache)); + return -EBUSY; + } + set_bit(from_cblock(cblock), cache->dirty_bitset); atomic_inc(&cache->nr_dirty); } else @@ -2929,6 +2954,9 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) static bool can_resume(struct cache *cache) { + bool clean_when_opened; + int r; + /* * Disallow retrying the resume operation for devices that failed the * first resume attempt, as the failure leaves the policy object partially @@ -2945,6 +2973,20 @@ static bool can_resume(struct cache *cache) return false; } + if (passthrough_mode(cache)) { + r = dm_cache_metadata_clean_when_opened(cache->cmd, &clean_when_opened); + if (r) { + DMERR("%s: failed to query metadata flags", cache_device_name(cache)); + return false; + } + + if (!clean_when_opened) { + DMERR("%s: unable to resume into passthrough mode after unclean shutdown", + cache_device_name(cache)); + return false; + } + } + return true; } @@ -3043,7 +3085,7 @@ static int cache_preresume(struct dm_target *ti) load_filtered_mapping, cache); if (r) { DMERR("%s: could not load cache mappings", cache_device_name(cache)); - if (r != -EFBIG) + if (r != -EFBIG && r != -EBUSY) metadata_operation_failed(cache, 
"dm_cache_load_mappings", r); return r; } @@ -3510,7 +3552,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 3, 0}, + .version = {2, 4, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 54823341c9fd..608b617fb817 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -32,6 +32,7 @@ #include <linux/ctype.h> #include <asm/page.h> #include <linux/unaligned.h> +#include <crypto/aes.h> #include <crypto/hash.h> #include <crypto/md5.h> #include <crypto/skcipher.h> @@ -109,11 +110,11 @@ struct crypt_iv_operations { const char *opts); void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); - int (*wipe)(struct crypt_config *cc); + void (*wipe)(struct crypt_config *cc); int (*generator)(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq); - int (*post)(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq); + void (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); }; struct iv_benbi_private { @@ -133,7 +134,7 @@ struct iv_tcw_private { #define ELEPHANT_MAX_KEY_SIZE 32 struct iv_elephant_private { - struct crypto_skcipher *tfm; + struct aes_enckey *key; }; /* @@ -507,14 +508,12 @@ static int crypt_iv_lmk_init(struct crypt_config *cc) return 0; } -static int crypt_iv_lmk_wipe(struct crypt_config *cc) +static void crypt_iv_lmk_wipe(struct crypt_config *cc) { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; if (lmk->seed) memset(lmk->seed, 0, LMK_SEED_SIZE); - - return 0; } static void crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, @@ -560,14 +559,14 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, return 0; } -static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static void crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct 
dm_crypt_request *dmreq) { struct scatterlist *sg; u8 *dst; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) - return 0; + return; sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); @@ -577,7 +576,6 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, crypto_xor(dst + sg->offset, iv, cc->iv_size); kunmap_local(dst); - return 0; } static void crypt_iv_tcw_dtr(struct crypt_config *cc) @@ -628,14 +626,12 @@ static int crypt_iv_tcw_init(struct crypt_config *cc) return 0; } -static int crypt_iv_tcw_wipe(struct crypt_config *cc) +static void crypt_iv_tcw_wipe(struct crypt_config *cc) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; memset(tcw->iv_seed, 0, cc->iv_size); memset(tcw->whitening, 0, TCW_WHITENING_SIZE); - - return 0; } static void crypt_iv_tcw_whitening(struct crypt_config *cc, @@ -687,22 +683,20 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, return 0; } -static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static void crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { struct scatterlist *sg; u8 *dst; if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) - return 0; + return; /* Apply whitening on ciphertext */ sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_local(dst); - - return 0; } static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, @@ -767,8 +761,8 @@ static void crypt_iv_elephant_dtr(struct crypt_config *cc) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; - crypto_free_skcipher(elephant->tfm); - elephant->tfm = NULL; + kfree_sensitive(elephant->key); + elephant->key = NULL; } static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -777,13 +771,9 @@ static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti, struct iv_elephant_private *elephant = 
&cc->iv_gen_private.elephant; int r; - elephant->tfm = crypto_alloc_skcipher("ecb(aes)", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(elephant->tfm)) { - r = PTR_ERR(elephant->tfm); - elephant->tfm = NULL; - return r; - } + elephant->key = kmalloc_obj(*elephant->key); + if (!elephant->key) + return -ENOMEM; r = crypt_iv_eboiv_ctr(cc, ti, NULL); if (r) @@ -935,41 +925,28 @@ static void diffuser_b_encrypt(u32 *d, size_t n) } } -static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq) +static void crypt_iv_elephant(struct crypt_config *cc, + struct dm_crypt_request *dmreq) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; - u8 *es, *ks, *data, *data2, *data_offset; - struct skcipher_request *req; - struct scatterlist *sg, *sg2, src, dst; - DECLARE_CRYPTO_WAIT(wait); - int i, r; - - req = skcipher_request_alloc(elephant->tfm, GFP_NOIO); - es = kzalloc(16, GFP_NOIO); /* Key for AES */ - ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */ - - if (!req || !es || !ks) { - r = -ENOMEM; - goto out; - } + u8 *data, *data2, *data_offset; + struct scatterlist *sg, *sg2; + union { + __le64 w[2]; + u8 b[16]; + } es; + u8 ks[32] __aligned(__alignof(long)); /* Elephant sector key */ + int i; - *(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + es.w[0] = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + es.w[1] = 0; /* E(Ks, e(s)) */ - sg_init_one(&src, es, 16); - sg_init_one(&dst, ks, 16); - skcipher_request_set_crypt(req, &src, &dst, 16, NULL); - skcipher_request_set_callback(req, 0, crypto_req_done, &wait); - r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); - if (r) - goto out; + aes_encrypt(elephant->key, &ks[0], es.b); /* E(Ks, e'(s)) */ - es[15] = 0x80; - sg_init_one(&dst, &ks[16], 16); - r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); - if (r) - goto out; + es.b[15] = 0x80; + aes_encrypt(elephant->key, &ks[16], es.b); sg = crypt_get_sg_data(cc, dmreq->sg_out); data = 
kmap_local_page(sg_page(sg)); @@ -1001,34 +978,24 @@ static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *d } kunmap_local(data); -out: - kfree_sensitive(ks); - kfree_sensitive(es); - skcipher_request_free(req); - return r; + memzero_explicit(ks, sizeof(ks)); + memzero_explicit(&es, sizeof(es)); } static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { - int r; - - if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { - r = crypt_iv_elephant(cc, dmreq); - if (r) - return r; - } + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + crypt_iv_elephant(cc, dmreq); return crypt_iv_eboiv_gen(cc, iv, dmreq); } -static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static void crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) - return crypt_iv_elephant(cc, dmreq); - - return 0; + crypt_iv_elephant(cc, dmreq); } static int crypt_iv_elephant_init(struct crypt_config *cc) @@ -1036,16 +1003,14 @@ static int crypt_iv_elephant_init(struct crypt_config *cc) struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; int key_offset = cc->key_size - cc->key_extra_size; - return crypto_skcipher_setkey(elephant->tfm, &cc->key[key_offset], cc->key_extra_size); + return aes_prepareenckey(elephant->key, &cc->key[key_offset], cc->key_extra_size); } -static int crypt_iv_elephant_wipe(struct crypt_config *cc) +static void crypt_iv_elephant_wipe(struct crypt_config *cc) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; - u8 key[ELEPHANT_MAX_KEY_SIZE]; - memset(key, 0, cc->key_extra_size); - return crypto_skcipher_setkey(elephant->tfm, key, cc->key_extra_size); + memzero_explicit(elephant->key, sizeof(*elephant->key)); } static const struct crypt_iv_operations crypt_iv_plain_ops = { @@ -1376,7 +1341,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, } 
if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) - r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + cc->iv_gen_ops->post(cc, org_iv, dmreq); bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); @@ -1453,7 +1418,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, r = crypto_skcipher_decrypt(req); if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) - r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + cc->iv_gen_ops->post(cc, org_iv, dmreq); bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); @@ -2217,7 +2182,7 @@ static void kcryptd_async_done(void *data, int error) } if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) - error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); + cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); if (error == -EBADMSG) { sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)); @@ -2673,11 +2638,8 @@ static int crypt_wipe_key(struct crypt_config *cc) get_random_bytes(&cc->key, cc->key_size); /* Wipe IV private keys */ - if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { - r = cc->iv_gen_ops->wipe(cc); - if (r) - return r; - } + if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) + cc->iv_gen_ops->wipe(cc); kfree_sensitive(cc->key_string); cc->key_string = NULL; @@ -3717,11 +3679,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct crypt_config *cc = ti->private; - limits->logical_block_size = - max_t(unsigned int, limits->logical_block_size, cc->sector_size); - limits->physical_block_size = - max_t(unsigned int, limits->physical_block_size, cc->sector_size); - limits->io_min = max_t(unsigned int, limits->io_min, cc->sector_size); + dm_stack_bs_limits(limits, cc->sector_size); limits->dma_alignment = limits->logical_block_size - 1; /* diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index efb3cd4f9cd4..9495ca035056 
100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -12,9 +12,7 @@ #include <linux/ima.h> #include <linux/sched/mm.h> -#include <crypto/hash.h> -#include <linux/crypto.h> -#include <crypto/hash_info.h> +#include <crypto/sha2.h> #define DM_MSG_PREFIX "ima" @@ -178,19 +176,13 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl size_t device_data_buf_len, target_metadata_buf_len, target_data_buf_len, l = 0; char *target_metadata_buf = NULL, *target_data_buf = NULL, *digest_buf = NULL; char *ima_buf = NULL, *device_data_buf = NULL; - int digest_size, last_target_measured = -1, r; + int last_target_measured = -1; status_type_t type = STATUSTYPE_IMA; size_t cur_total_buf_len = 0; unsigned int num_targets, i; - SHASH_DESC_ON_STACK(shash, NULL); - struct crypto_shash *tfm = NULL; - u8 *digest = NULL; + struct sha256_ctx hash_ctx; + u8 digest[SHA256_DIGEST_SIZE]; bool noio = false; - /* - * In below hash_alg_prefix_len assignment +1 is for the additional char (':'), - * when prefixing the hash value with the hash algorithm name. e.g. sha256:<hash_value>. 
- */ - const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1; char table_load_event_name[] = "dm_table_load"; ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, noio); @@ -210,19 +202,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (dm_ima_alloc_and_copy_device_data(table->md, &device_data_buf, num_targets, noio)) goto error; - tfm = crypto_alloc_shash(DM_IMA_TABLE_HASH_ALG, 0, 0); - if (IS_ERR(tfm)) - goto error; - - shash->tfm = tfm; - digest_size = crypto_shash_digestsize(tfm); - digest = dm_ima_alloc(digest_size, noio); - if (!digest) - goto error; - - r = crypto_shash_init(shash); - if (r) - goto error; + sha256_init(&hash_ctx); memcpy(ima_buf + l, DM_IMA_VERSION_STR, table->md->ima.dm_version_str_len); l += table->md->ima.dm_version_str_len; @@ -270,9 +250,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl */ if (unlikely(cur_total_buf_len >= DM_IMA_MEASUREMENT_BUF_LEN)) { dm_ima_measure_data(table_load_event_name, ima_buf, l, noio); - r = crypto_shash_update(shash, (const u8 *)ima_buf, l); - if (r < 0) - goto error; + sha256_update(&hash_ctx, (const u8 *)ima_buf, l); memset(ima_buf, 0, DM_IMA_MEASUREMENT_BUF_LEN); l = 0; @@ -311,9 +289,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (!last_target_measured) { dm_ima_measure_data(table_load_event_name, ima_buf, l, noio); - r = crypto_shash_update(shash, (const u8 *)ima_buf, l); - if (r < 0) - goto error; + sha256_update(&hash_ctx, (const u8 *)ima_buf, l); } /* @@ -321,20 +297,13 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl * so that the table data can be verified against the future device state change * events, e.g. resume, rename, remove, table-clear etc. 
*/ - r = crypto_shash_final(shash, digest); - if (r < 0) - goto error; - - digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, noio); + sha256_final(&hash_ctx, digest); + digest_buf = kasprintf(GFP_KERNEL, "sha256:%*phN", SHA256_DIGEST_SIZE, + digest); if (!digest_buf) goto error; - snprintf(digest_buf, hash_alg_prefix_len + 1, "%s:", DM_IMA_TABLE_HASH_ALG); - - for (i = 0; i < digest_size; i++) - snprintf((digest_buf + hash_alg_prefix_len + (i*2)), 3, "%02x", digest[i]); - if (table->md->ima.active_table.hash != table->md->ima.inactive_table.hash) kfree(table->md->ima.inactive_table.hash); @@ -354,9 +323,6 @@ error: kfree(digest_buf); kfree(device_data_buf); exit: - kfree(digest); - if (tfm) - crypto_free_shash(tfm); kfree(ima_buf); kfree(target_metadata_buf); kfree(target_data_buf); diff --git a/drivers/md/dm-ima.h b/drivers/md/dm-ima.h index 568870a1a145..a403deca6093 100644 --- a/drivers/md/dm-ima.h +++ b/drivers/md/dm-ima.h @@ -15,7 +15,6 @@ #define DM_IMA_TARGET_METADATA_BUF_LEN 128 #define DM_IMA_TARGET_DATA_BUF_LEN 2048 #define DM_IMA_DEVICE_CAPACITY_BUF_LEN 128 -#define DM_IMA_TABLE_HASH_ALG "sha256" #define __dm_ima_stringify(s) #s #define __dm_ima_str(s) __dm_ima_stringify(s) diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index 7403823384c5..c1bacba92c65 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -303,8 +303,10 @@ static int __init dm_init_init(void) } } - if (waitfor[0]) + if (waitfor[0]) { + wait_for_device_probe(); DMINFO("all devices available"); + } list_for_each_entry(dev, &devices, list) { if (dm_early_create(&dev->dmi, dev->table, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 06e805902151..65c30dec8222 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4046,13 +4046,9 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim { struct dm_integrity_c *ic = ti->private; - if (ic->sectors_per_block > 1) { - 
limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT; - limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT; - limits->io_min = ic->sectors_per_block << SECTOR_SHIFT; - limits->dma_alignment = limits->logical_block_size - 1; - limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; - } + dm_stack_bs_limits(limits, ic->sectors_per_block << SECTOR_SHIFT); + limits->dma_alignment = limits->logical_block_size - 1; + limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; if (!ic->internal_hash) { struct blk_integrity *bi = &limits->integrity; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 3ab8b4beff86..a529174c94cf 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -64,7 +64,11 @@ struct vers_iter { static struct rb_root name_rb_tree = RB_ROOT; static struct rb_root uuid_rb_tree = RB_ROOT; -static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred); +#define DM_REMOVE_KEEP_OPEN_DEVICES 1 +#define DM_REMOVE_MARK_DEFERRED 2 +#define DM_REMOVE_ONLY_DEFERRED 4 +#define DM_REMOVE_INTERRUPTIBLE 8 +static int dm_hash_remove_all(unsigned flags); /* * Guards access to both hash tables. 
@@ -78,7 +82,7 @@ static DEFINE_MUTEX(dm_hash_cells_mutex); static void dm_hash_exit(void) { - dm_hash_remove_all(false, false, false); + dm_hash_remove_all(0); } /* @@ -333,7 +337,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc) return table; } -static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred) +static int dm_hash_remove_all(unsigned flags) { int dev_skipped; struct rb_node *n; @@ -347,12 +351,17 @@ retry: down_write(&_hash_lock); for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) { + if (flags & DM_REMOVE_INTERRUPTIBLE && fatal_signal_pending(current)) { + up_write(&_hash_lock); + return -EINTR; + } + hc = container_of(n, struct hash_cell, name_node); md = hc->md; dm_get(md); - if (keep_open_devices && - dm_lock_for_deletion(md, mark_deferred, only_deferred)) { + if (flags & DM_REMOVE_KEEP_OPEN_DEVICES && + dm_lock_for_deletion(md, !!(flags & DM_REMOVE_MARK_DEFERRED), !!(flags & DM_REMOVE_ONLY_DEFERRED))) { dm_put(md); dev_skipped++; continue; @@ -368,7 +377,7 @@ retry: } dm_ima_measure_on_device_remove(md, true); dm_put(md); - if (likely(keep_open_devices)) + if (likely(flags & DM_REMOVE_KEEP_OPEN_DEVICES)) dm_destroy(md); else dm_destroy_immediate(md); @@ -384,8 +393,10 @@ retry: up_write(&_hash_lock); - if (dev_skipped) + if (dev_skipped && !(flags & DM_REMOVE_ONLY_DEFERRED)) DMWARN("remove_all left %d open device(s)", dev_skipped); + + return 0; } /* @@ -513,7 +524,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, void dm_deferred_remove(void) { - dm_hash_remove_all(true, false, true); + dm_hash_remove_all(DM_REMOVE_KEEP_OPEN_DEVICES | DM_REMOVE_ONLY_DEFERRED); } /* @@ -529,9 +540,13 @@ typedef int (*ioctl_fn)(struct file *filp, struct dm_ioctl *param, size_t param_ static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_size) { - dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false); + int r; + int flags = DM_REMOVE_KEEP_OPEN_DEVICES | 
DM_REMOVE_INTERRUPTIBLE; + if (param->flags & DM_DEFERRED_REMOVE) + flags |= DM_REMOVE_MARK_DEFERRED; + r = dm_hash_remove_all(flags); param->data_size = 0; - return 0; + return r; } /* @@ -1341,6 +1356,10 @@ static void retrieve_status(struct dm_table *table, used = param->data_start + (outptr - outbuf); outptr = align_ptr(outptr); + if (!outptr || outptr > outbuf + len) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } spec->next = outptr - outbuf; } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 1aa6a4a7d232..d316757a328b 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -373,7 +373,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, struct log_c *lc; uint32_t region_size; - unsigned int region_count; + sector_t region_count; size_t bitset_size, buf_size; int r; char dummy; @@ -401,6 +401,10 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, } region_count = dm_sector_div_up(ti->len, region_size); + if (region_count > UINT_MAX) { + DMWARN("region count exceeds limit of %u", UINT_MAX); + return -EINVAL; + } lc = kmalloc_obj(*lc); if (!lc) { diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 8f4ae2f51545..7cb7bb6233b6 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -102,7 +102,6 @@ struct multipath { struct bio_list queued_bios; struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ - bool is_suspending; }; /* @@ -1749,9 +1748,6 @@ static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; - spin_lock_irq(&m->lock); - m->is_suspending = true; - spin_unlock_irq(&m->lock); /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) queue_if_no_path(m, false, true, __func__); @@ -1774,7 +1770,6 @@ static void multipath_resume(struct dm_target *ti) struct multipath *m = ti->private; spin_lock_irq(&m->lock); - 
m->is_suspending = false; if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); @@ -2098,7 +2093,7 @@ static int probe_active_paths(struct multipath *m) if (m->current_pg == m->last_probed_pg) goto skip_probe; } - if (!m->current_pg || m->is_suspending || + if (!m->current_pg || dm_suspended(m->ti) || test_bit(MPATHF_QUEUE_IO, &m->flags)) goto skip_probe; set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); @@ -2107,7 +2102,7 @@ static int probe_active_paths(struct multipath *m) list_for_each_entry(pgpath, &pg->pgpaths, list) { if (pg != READ_ONCE(m->current_pg) || - READ_ONCE(m->is_suspending)) + dm_suspended(m->ti)) goto out; if (!pgpath->is_active) continue; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 80a5c4127707..de5c00704e69 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -993,13 +993,13 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, return NULL; } - *args_used = 2 + param_count; - - if (argc < *args_used) { + if (param_count > argc - 2) { ti->error = "Insufficient mirror log arguments"; return NULL; } + *args_used = 2 + param_count; + dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, argv + 2); if (!dl) { diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c index e3bba0b28aad..b8a3977b815d 100644 --- a/drivers/md/dm-vdo/action-manager.c +++ b/drivers/md/dm-vdo/action-manager.c @@ -107,7 +107,7 @@ int vdo_make_action_manager(zone_count_t zones, struct action_manager **manager_ptr) { struct action_manager *manager; - int result = vdo_allocate(1, struct action_manager, __func__, &manager); + int result = vdo_allocate(1, __func__, &manager); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index a7db5b41155e..5ffc360540ed 100644 --- a/drivers/md/dm-vdo/block-map.c +++ 
b/drivers/md/dm-vdo/block-map.c @@ -221,8 +221,7 @@ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) u64 size = cache->page_count * (u64) VDO_BLOCK_SIZE; int result; - result = vdo_allocate(cache->page_count, struct page_info, "page infos", - &cache->infos); + result = vdo_allocate(cache->page_count, "page infos", &cache->infos); if (result != VDO_SUCCESS) return result; @@ -2364,18 +2363,15 @@ static int make_segment(struct forest *old_forest, block_count_t new_pages, forest->segments = index + 1; - result = vdo_allocate(forest->segments, struct boundary, - "forest boundary array", &forest->boundaries); + result = vdo_allocate(forest->segments, "forest boundary array", &forest->boundaries); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(forest->segments, struct tree_page *, - "forest page pointers", &forest->pages); + result = vdo_allocate(forest->segments, "forest page pointers", &forest->pages); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(new_pages, struct tree_page, - "new forest pages", &forest->pages[index]); + result = vdo_allocate(new_pages, "new forest pages", &forest->pages[index]); if (result != VDO_SUCCESS) return result; @@ -2400,9 +2396,7 @@ static int make_segment(struct forest *old_forest, block_count_t new_pages, struct block_map_tree *tree = &(forest->trees[root]); height_t height; - int result = vdo_allocate(forest->segments, - struct block_map_tree_segment, - "tree root segments", &tree->segments); + result = vdo_allocate(forest->segments, "tree root segments", &tree->segments); if (result != VDO_SUCCESS) return result; @@ -2478,9 +2472,7 @@ static int make_forest(struct block_map *map, block_count_t entries) return VDO_SUCCESS; } - result = vdo_allocate_extended(struct forest, map->root_count, - struct block_map_tree, __func__, - &forest); + result = vdo_allocate_extended(map->root_count, trees, __func__, &forest); if (result != VDO_SUCCESS) return result; @@ -2707,8 
+2699,7 @@ void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback, struct cursors *cursors; int result; - result = vdo_allocate_extended(struct cursors, map->root_count, - struct cursor, __func__, &cursors); + result = vdo_allocate_extended(map->root_count, cursors, __func__, &cursors); if (result != VDO_SUCCESS) { vdo_fail_completion(completion, result); return; @@ -2758,9 +2749,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map, zone->thread_id = vdo->thread_config.logical_threads[zone_number]; zone->block_map = map; - result = vdo_allocate_extended(struct dirty_lists, maximum_age, - dirty_era_t, __func__, - &zone->dirty_lists); + result = vdo_allocate_extended(maximum_age, eras, __func__, &zone->dirty_lists); if (result != VDO_SUCCESS) return result; @@ -2900,9 +2889,8 @@ int vdo_decode_block_map(struct block_map_state_2_0 state, block_count_t logical if (result != VDO_SUCCESS) return result; - result = vdo_allocate_extended(struct block_map, - vdo->thread_config.logical_zone_count, - struct block_map_zone, __func__, &map); + result = vdo_allocate_extended(vdo->thread_config.logical_zone_count, + zones, __func__, &map); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/block-map.h b/drivers/md/dm-vdo/block-map.h index 39a13039e4a3..4fd24043b0d7 100644 --- a/drivers/md/dm-vdo/block-map.h +++ b/drivers/md/dm-vdo/block-map.h @@ -276,7 +276,7 @@ struct block_map { block_count_t next_entry_count; zone_count_t zone_count; - struct block_map_zone zones[]; + struct block_map_zone zones[] __counted_by(zone_count); }; /** diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h index 2a8b03779f87..b84e7edeb22e 100644 --- a/drivers/md/dm-vdo/constants.h +++ b/drivers/md/dm-vdo/constants.h @@ -44,6 +44,9 @@ enum { /* The default size of each slab journal, in blocks */ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, + /* The recovery journal starting sequence number set at format time */ + 
RECOVERY_JOURNAL_STARTING_SEQUENCE_NUMBER = 1, + /* * The initial size of lbn_operations and pbn_operations, which is based upon the expected * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely @@ -57,8 +60,14 @@ enum { /* The maximum number of physical zones */ MAX_VDO_PHYSICAL_ZONES = 16, - /* The base-2 logarithm of the maximum blocks in one slab */ - MAX_VDO_SLAB_BITS = 23, + /* The default blocks in one slab */ + DEFAULT_VDO_SLAB_BLOCKS = 1U << 19, + + /* The minimum blocks in one slab */ + MIN_VDO_SLAB_BLOCKS = 1U << 13, + + /* The maximum blocks in one slab */ + MAX_VDO_SLAB_BLOCKS = 1U << 23, /* The maximum number of slabs the slab depot supports */ MAX_VDO_SLABS = 8192, diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 3333e1e5b02e..370d4239ba31 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -842,8 +842,7 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, struct data_vio_pool *pool; data_vio_count_t i; - result = vdo_allocate_extended(struct data_vio_pool, pool_size, struct data_vio, - __func__, &pool); + result = vdo_allocate_extended(pool_size, data_vios, __func__, &pool); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 75a26f3f4461..5f5639d89bc6 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -296,7 +296,7 @@ struct hash_zones { /* The number of zones */ zone_count_t zone_count; /* The hash zones themselves */ - struct hash_zone zones[]; + struct hash_zone zones[] __counted_by(zone_count); }; /* These are in milliseconds. 
*/ @@ -2364,8 +2364,7 @@ static int __must_check initialize_zone(struct vdo *vdo, struct hash_zones *zone vdo_set_completion_callback(&zone->completion, timeout_index_operations_callback, zone->thread_id); INIT_LIST_HEAD(&zone->lock_pool); - result = vdo_allocate(LOCK_POOL_CAPACITY, struct hash_lock, "hash_lock array", - &zone->lock_array); + result = vdo_allocate(LOCK_POOL_CAPACITY, "hash_lock array", &zone->lock_array); if (result != VDO_SUCCESS) return result; @@ -2418,8 +2417,7 @@ int vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = vdo_allocate_extended(struct hash_zones, zone_count, struct hash_zone, - __func__, &zones); + result = vdo_allocate_extended(zone_count, zones, __func__, &zones); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 6af40d40f255..1d8375cc3c3e 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -9,6 +9,7 @@ #include <linux/delay.h> #include <linux/device-mapper.h> #include <linux/err.h> +#include <linux/log2.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> @@ -60,6 +61,11 @@ enum admin_phases { LOAD_PHASE_DRAIN_JOURNAL, LOAD_PHASE_WAIT_FOR_READ_ONLY, PRE_LOAD_PHASE_START, + PRE_LOAD_PHASE_FORMAT_START, + PRE_LOAD_PHASE_FORMAT_SUPER, + PRE_LOAD_PHASE_FORMAT_GEOMETRY, + PRE_LOAD_PHASE_FORMAT_END, + PRE_LOAD_PHASE_LOAD_SUPER, PRE_LOAD_PHASE_LOAD_COMPONENTS, PRE_LOAD_PHASE_END, PREPARE_GROW_PHYSICAL_PHASE_START, @@ -109,6 +115,11 @@ static const char * const ADMIN_PHASE_NAMES[] = { "LOAD_PHASE_DRAIN_JOURNAL", "LOAD_PHASE_WAIT_FOR_READ_ONLY", "PRE_LOAD_PHASE_START", + "PRE_LOAD_PHASE_FORMAT_START", + "PRE_LOAD_PHASE_FORMAT_SUPER", + "PRE_LOAD_PHASE_FORMAT_GEOMETRY", + "PRE_LOAD_PHASE_FORMAT_END", + "PRE_LOAD_PHASE_LOAD_SUPER", "PRE_LOAD_PHASE_LOAD_COMPONENTS", "PRE_LOAD_PHASE_END", "PREPARE_GROW_PHYSICAL_PHASE_START", @@ 
-273,8 +284,7 @@ static int split_string(const char *string, char separator, char ***substring_ar substring_count++; } - result = vdo_allocate(substring_count + 1, char *, "string-splitting array", - &substrings); + result = vdo_allocate(substring_count + 1, "string-splitting array", &substrings); if (result != VDO_SUCCESS) return result; @@ -282,7 +292,7 @@ static int split_string(const char *string, char separator, char ***substring_ar if (*s == separator) { ptrdiff_t length = s - string; - result = vdo_allocate(length + 1, char, "split string", + result = vdo_allocate(length + 1, "split string", &substrings[current_substring]); if (result != VDO_SUCCESS) { free_string_array(substrings); @@ -303,8 +313,7 @@ static int split_string(const char *string, char separator, char ***substring_ar BUG_ON(current_substring != (substring_count - 1)); length = strlen(string); - result = vdo_allocate(length + 1, char, "split string", - &substrings[current_substring]); + result = vdo_allocate(length + 1, "split string", &substrings[current_substring]); if (result != VDO_SUCCESS) { free_string_array(substrings); return result; @@ -332,7 +341,7 @@ static int join_strings(char **substring_array, size_t array_length, char separa for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) string_length += strlen(substring_array[i]) + 1; - result = vdo_allocate(string_length, char, __func__, &output); + result = vdo_allocate(string_length, __func__, &output); if (result != VDO_SUCCESS) return result; @@ -380,6 +389,75 @@ static inline int __must_check parse_bool(const char *bool_str, const char *true } /** + * parse_memory() - Parse a string into an index memory value. + * @memory_str: The string value to convert to a memory value. + * @memory_ptr: A pointer to return the memory value in. 
+ * + * Return: VDO_SUCCESS or an error + */ +static int __must_check parse_memory(const char *memory_str, + uds_memory_config_size_t *memory_ptr) +{ + uds_memory_config_size_t memory; + + if (strcmp(memory_str, "0.25") == 0) { + memory = UDS_MEMORY_CONFIG_256MB; + } else if ((strcmp(memory_str, "0.5") == 0) || (strcmp(memory_str, "0.50") == 0)) { + memory = UDS_MEMORY_CONFIG_512MB; + } else if (strcmp(memory_str, "0.75") == 0) { + memory = UDS_MEMORY_CONFIG_768MB; + } else { + unsigned int value; + int result; + + result = kstrtouint(memory_str, 10, &value); + if (result) { + vdo_log_error("optional parameter error: invalid memory size, must be a positive integer"); + return -EINVAL; + } + + if (value > UDS_MEMORY_CONFIG_MAX) { + vdo_log_error("optional parameter error: invalid memory size, must not be greater than %d", + UDS_MEMORY_CONFIG_MAX); + return -EINVAL; + } + + memory = value; + } + + *memory_ptr = memory; + return VDO_SUCCESS; +} + +/** + * parse_slab_size() - Parse a string option into a slab size value. + * @slab_str: The string value representing slab size. + * @slab_size_ptr: A pointer to return the slab size in. + * + * Return: VDO_SUCCESS or an error + */ +static int __must_check parse_slab_size(const char *slab_str, block_count_t *slab_size_ptr) +{ + block_count_t value; + int result; + + result = kstrtoull(slab_str, 10, &value); + if (result) { + vdo_log_error("optional parameter error: invalid slab size, must be a positive integer"); + return -EINVAL; + } + + if (value < MIN_VDO_SLAB_BLOCKS || value > MAX_VDO_SLAB_BLOCKS || (!is_power_of_2(value))) { + vdo_log_error("optional parameter error: invalid slab size, must be a power of two between %u and %u", + MIN_VDO_SLAB_BLOCKS, MAX_VDO_SLAB_BLOCKS); + return -EINVAL; + } + + *slab_size_ptr = value; + return VDO_SUCCESS; +} + +/** + * process_one_thread_config_spec() - Process one component of a thread parameter configuration * string and update the configuration data structure. 
* @thread_param_type: The type of thread specified. @@ -568,7 +646,7 @@ static int process_one_key_value_pair(const char *key, unsigned int value, } /* Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 */ if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { - vdo_log_error("optional parameter error: at most %d max discard blocks are allowed", + vdo_log_error("optional parameter error: at most %d max discard blocks are allowed", UINT_MAX / VDO_BLOCK_SIZE); return -EINVAL; } @@ -600,7 +678,16 @@ static int parse_one_key_value_pair(const char *key, const char *value, if (strcmp(key, "compression") == 0) return parse_bool(value, "on", "off", &config->compression); - /* The remaining arguments must have integral values. */ + if (strcmp(key, "indexSparse") == 0) + return parse_bool(value, "on", "off", &config->index_sparse); + + if (strcmp(key, "indexMemory") == 0) + return parse_memory(value, &config->index_memory); + + if (strcmp(key, "slabSize") == 0) + return parse_slab_size(value, &config->slab_blocks); + + /* The remaining arguments must have non-negative integral values. 
*/ result = kstrtouint(value, 10, &count); if (result) { vdo_log_error("optional config string error: integer value needed, found \"%s\"", @@ -715,6 +802,12 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, struct device_config *config = NULL; int result; + if (logical_bytes > (MAXIMUM_VDO_LOGICAL_BLOCKS * VDO_BLOCK_SIZE)) { + handle_parse_error(config, error_ptr, + "Logical size exceeds the maximum"); + return VDO_BAD_CONFIGURATION; + } + if ((logical_bytes % VDO_BLOCK_SIZE) != 0) { handle_parse_error(config, error_ptr, "Logical size must be a multiple of 4096"); @@ -726,7 +819,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, return VDO_BAD_CONFIGURATION; } - result = vdo_allocate(1, struct device_config, "device_config", &config); + result = vdo_allocate(1, "device_config", &config); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Could not allocate config structure"); @@ -758,6 +851,9 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, config->max_discard_blocks = 1; config->deduplication = true; config->compression = false; + config->index_memory = UDS_MEMORY_CONFIG_256MB; + config->index_sparse = false; + config->slab_blocks = DEFAULT_VDO_SLAB_BLOCKS; arg_set.argc = argc; arg_set.argv = argv; @@ -783,7 +879,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, /* Get the physical blocks, if known. */ if (config->version >= 1) { result = kstrtoull(dm_shift_arg(&arg_set), 10, &config->physical_blocks); - if (result != VDO_SUCCESS) { + if (result) { handle_parse_error(config, error_ptr, "Invalid physical block count"); return VDO_BAD_CONFIGURATION; @@ -804,7 +900,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, /* Get the page cache size. 
*/ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->cache_size); - if (result != VDO_SUCCESS) { + if (result) { handle_parse_error(config, error_ptr, "Invalid block map page cache size"); return VDO_BAD_CONFIGURATION; @@ -812,7 +908,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, /* Get the block map era length. */ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->block_map_maximum_age); - if (result != VDO_SUCCESS) { + if (result) { handle_parse_error(config, error_ptr, "Invalid block map maximum age"); return VDO_BAD_CONFIGURATION; } @@ -1401,7 +1497,33 @@ static void pre_load_callback(struct vdo_completion *completion) vdo_continue_completion(completion, result); return; } + if (vdo->needs_formatting) + vdo->admin.phase = PRE_LOAD_PHASE_FORMAT_START; + else + vdo->admin.phase = PRE_LOAD_PHASE_LOAD_SUPER; + + vdo_continue_completion(completion, VDO_SUCCESS); + return; + + case PRE_LOAD_PHASE_FORMAT_START: + vdo_continue_completion(completion, vdo_clear_layout(vdo)); + return; + + case PRE_LOAD_PHASE_FORMAT_SUPER: + vdo_save_super_block(vdo, completion); + return; + + case PRE_LOAD_PHASE_FORMAT_GEOMETRY: + vdo_save_geometry_block(vdo, completion); + return; + + case PRE_LOAD_PHASE_FORMAT_END: + /* cleanup layout before load adds to it */ + vdo_uninitialize_layout(&vdo->states.layout); + vdo_continue_completion(completion, VDO_SUCCESS); + return; + case PRE_LOAD_PHASE_LOAD_SUPER: vdo_load_super_block(vdo, completion); return; @@ -1459,10 +1581,13 @@ static int vdo_initialize(struct dm_target *ti, unsigned int instance, vdo_log_debug("Logical blocks = %llu", logical_blocks); vdo_log_debug("Physical block size = %llu", (u64) block_size); vdo_log_debug("Physical blocks = %llu", config->physical_blocks); + vdo_log_debug("Slab size = %llu", config->slab_blocks); vdo_log_debug("Block map cache blocks = %u", config->cache_size); vdo_log_debug("Block map maximum age = %u", config->block_map_maximum_age); 
vdo_log_debug("Deduplication = %s", (config->deduplication ? "on" : "off")); vdo_log_debug("Compression = %s", (config->compression ? "on" : "off")); + vdo_log_debug("Index memory = %u", config->index_memory); + vdo_log_debug("Index sparse = %s", (config->index_sparse ? "on" : "off")); vdo = vdo_find_matching(vdo_uses_device, config); if (vdo != NULL) { @@ -2858,7 +2983,7 @@ static void vdo_resume(struct dm_target *ti) static struct target_type vdo_target_bio = { .features = DM_TARGET_SINGLETON, .name = "vdo", - .version = { 9, 1, 0 }, + .version = { 9, 2, 0 }, .module = THIS_MODULE, .ctr = vdo_ctr, .dtr = vdo_dtr, diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index bd60f4b3a0d0..d75e023df637 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -12,15 +12,10 @@ #include "permassert.h" #include "constants.h" +#include "indexer.h" #include "status-codes.h" #include "types.h" -/** The maximum logical space is 4 petabytes, which is 1 terablock. */ -static const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024; - -/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ -static const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64; - struct geometry_block { char magic_number[VDO_GEOMETRY_MAGIC_NUMBER_SIZE]; struct packed_header header; @@ -293,6 +288,62 @@ static void decode_volume_geometry(u8 *buffer, size_t *offset, } /** + * vdo_encode_volume_geometry() - Encode the on-disk representation of a volume geometry into a buffer. + * @buffer: A buffer to store the encoding. + * @geometry: The geometry to encode. + * @version: The geometry block version to encode. + * + * Return: VDO_SUCCESS or an error. 
+ */ +int vdo_encode_volume_geometry(u8 *buffer, const struct volume_geometry *geometry, + u32 version) +{ + int result; + enum volume_region_id id; + u32 checksum; + size_t offset = 0; + const struct header *header; + + memcpy(buffer, VDO_GEOMETRY_MAGIC_NUMBER, VDO_GEOMETRY_MAGIC_NUMBER_SIZE); + offset += VDO_GEOMETRY_MAGIC_NUMBER_SIZE; + + header = (version > 4) ? &GEOMETRY_BLOCK_HEADER_5_0 : &GEOMETRY_BLOCK_HEADER_4_0; + vdo_encode_header(buffer, &offset, header); + + /* This is for backwards compatibility */ + encode_u32_le(buffer, &offset, geometry->unused); + encode_u64_le(buffer, &offset, geometry->nonce); + memcpy(buffer + offset, (unsigned char *) &geometry->uuid, sizeof(uuid_t)); + offset += sizeof(uuid_t); + + if (version > 4) + encode_u64_le(buffer, &offset, geometry->bio_offset); + + for (id = 0; id < VDO_VOLUME_REGION_COUNT; id++) { + encode_u32_le(buffer, &offset, geometry->regions[id].id); + encode_u64_le(buffer, &offset, geometry->regions[id].start_block); + } + + encode_u32_le(buffer, &offset, geometry->index_config.mem); + encode_u32_le(buffer, &offset, 0); + + if (geometry->index_config.sparse) + buffer[offset++] = 1; + else + buffer[offset++] = 0; + + result = VDO_ASSERT(header->size == offset + sizeof(u32), + "should have encoded up to the geometry checksum"); + if (result != VDO_SUCCESS) + return result; + + checksum = vdo_crc32(buffer, offset); + encode_u32_le(buffer, &offset, checksum); + + return VDO_SUCCESS; +} + +/** * vdo_parse_geometry_block() - Decode and validate an encoded geometry block. * @block: The encoded geometry block. * @geometry: The structure to receive the decoded fields. 
@@ -798,7 +849,7 @@ static int allocate_partition(struct layout *layout, u8 id, struct partition *partition; int result; - result = vdo_allocate(1, struct partition, __func__, &partition); + result = vdo_allocate(1, __func__, &partition); if (result != VDO_SUCCESS) return result; @@ -1219,9 +1270,9 @@ int vdo_validate_config(const struct vdo_config *config, if (result != VDO_SUCCESS) return result; - result = VDO_ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS), - "slab size must be less than or equal to 2^%d", - MAX_VDO_SLAB_BITS); + result = VDO_ASSERT(config->slab_size <= MAX_VDO_SLAB_BLOCKS, + "slab size must be a power of two less than or equal to %d", + MAX_VDO_SLAB_BLOCKS); if (result != VDO_SUCCESS) return result; @@ -1486,3 +1537,153 @@ int vdo_decode_super_block(u8 *buffer) return ((checksum != saved_checksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS); } + +/** + * vdo_initialize_component_states() - Initialize the components so they can be written out. + * @vdo_config: The config used for component state initialization. + * @geometry: The volume geometry used to calculate the data region offset. + * @nonce: The nonce to use to identify the vdo. + * @states: The component states to initialize. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_initialize_component_states(const struct vdo_config *vdo_config, + const struct volume_geometry *geometry, + nonce_t nonce, + struct vdo_component_states *states) +{ + int result; + struct slab_config slab_config; + struct partition *partition; + + states->vdo.config = *vdo_config; + states->vdo.nonce = nonce; + states->volume_version = VDO_VOLUME_VERSION_67_0; + + states->recovery_journal = (struct recovery_journal_state_7_0) { + .journal_start = RECOVERY_JOURNAL_STARTING_SEQUENCE_NUMBER, + .logical_blocks_used = 0, + .block_map_data_blocks = 0, + }; + + /* + * The layout starts 1 block past the beginning of the data region, as the + * data region contains the super block but the layout does not. 
+ */ + result = vdo_initialize_layout(vdo_config->physical_blocks, + vdo_get_data_region_start(*geometry) + 1, + DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT, + vdo_config->recovery_journal_size, + VDO_SLAB_SUMMARY_BLOCKS, + &states->layout); + if (result != VDO_SUCCESS) + return result; + + result = vdo_configure_slab(vdo_config->slab_size, + vdo_config->slab_journal_blocks, + &slab_config); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + result = vdo_get_partition(&states->layout, VDO_SLAB_DEPOT_PARTITION, + &partition); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + result = vdo_configure_slab_depot(partition, slab_config, 0, + &states->slab_depot); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + result = vdo_get_partition(&states->layout, VDO_BLOCK_MAP_PARTITION, + &partition); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + states->block_map = (struct block_map_state_2_0) { + .flat_page_origin = VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN, + .flat_page_count = 0, + .root_origin = partition->offset, + .root_count = DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT, + }; + + states->vdo.state = VDO_NEW; + + return VDO_SUCCESS; +} + +/** + * vdo_compute_index_blocks() - Compute the number of blocks that the indexer will use. + * @config: The index config from which the blocks are calculated. + * @index_blocks_ptr: The number of blocks the index will use. + * + * Return: VDO_SUCCESS or an error code. 
+ */ +static int vdo_compute_index_blocks(const struct index_config *config, + block_count_t *index_blocks_ptr) +{ + int result; + u64 index_bytes; + struct uds_parameters uds_parameters = { + .memory_size = config->mem, + .sparse = config->sparse, + }; + + result = uds_compute_index_size(&uds_parameters, &index_bytes); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "error computing index size"); + + *index_blocks_ptr = index_bytes / VDO_BLOCK_SIZE; + return VDO_SUCCESS; +} + +/** + * vdo_initialize_volume_geometry() - Initialize the volume geometry so it can be written out. + * @nonce: The nonce to use to identify the vdo. + * @uuid: The uuid to use to identify the vdo. + * @index_config: The config used for structure initialization. + * @geometry: The volume geometry to initialize. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_initialize_volume_geometry(nonce_t nonce, uuid_t *uuid, + const struct index_config *index_config, + struct volume_geometry *geometry) +{ + int result; + block_count_t index_blocks = 0; + + result = vdo_compute_index_blocks(index_config, &index_blocks); + if (result != VDO_SUCCESS) + return result; + + *geometry = (struct volume_geometry) { + /* This is for backwards compatibility. 
*/ + .unused = 0, + .nonce = nonce, + .bio_offset = 0, + .regions = { + [VDO_INDEX_REGION] = { + .id = VDO_INDEX_REGION, + .start_block = 1, + }, + [VDO_DATA_REGION] = { + .id = VDO_DATA_REGION, + .start_block = 1 + index_blocks, + } + } + }; + + memcpy(&(geometry->uuid), uuid, sizeof(uuid_t)); + memcpy(&geometry->index_config, index_config, sizeof(struct index_config)); + + return VDO_SUCCESS; +} diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h index 87b7d2f3b545..67ff0ff2ffda 100644 --- a/drivers/md/dm-vdo/encodings.h +++ b/drivers/md/dm-vdo/encodings.h @@ -608,6 +608,12 @@ struct vdo_config { block_count_t slab_journal_blocks; /* number of slab journal blocks */ }; +/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +#define MAXIMUM_VDO_LOGICAL_BLOCKS ((block_count_t)(1024ULL * 1024 * 1024 * 1024)) + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ +#define MAXIMUM_VDO_PHYSICAL_BLOCKS ((block_count_t)(1024ULL * 1024 * 1024 * 64)) + /* This is the structure that captures the vdo fields saved as a super block component. 
*/ struct vdo_component { enum vdo_state state; @@ -803,6 +809,12 @@ vdo_get_index_region_size(struct volume_geometry geometry) vdo_get_index_region_start(geometry); } +int vdo_initialize_volume_geometry(nonce_t nonce, uuid_t *uuid, + const struct index_config *index_config, + struct volume_geometry *geometry); + +int vdo_encode_volume_geometry(u8 *buffer, const struct volume_geometry *geometry, + u32 version); int __must_check vdo_parse_geometry_block(unsigned char *block, struct volume_geometry *geometry); @@ -1264,6 +1276,11 @@ int __must_check vdo_validate_component_states(struct vdo_component_states *stat void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states); int __must_check vdo_decode_super_block(u8 *buffer); +int vdo_initialize_component_states(const struct vdo_config *vdo_config, + const struct volume_geometry *geometry, + nonce_t nonce, + struct vdo_component_states *states); + /* We start with 0L and postcondition with ~0L to match our historical usage in userspace. 
*/ static inline u32 vdo_crc32(const void *buf, unsigned long len) { diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index 82a259ef1601..6c1610ba91b6 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -105,7 +105,7 @@ static void *allocate_flush(gfp_t gfp_mask, void *pool_data) if ((gfp_mask & GFP_NOWAIT) == GFP_NOWAIT) { flush = vdo_allocate_memory_nowait(sizeof(struct vdo_flush), __func__); } else { - int result = vdo_allocate(1, struct vdo_flush, __func__, &flush); + int result = vdo_allocate(1, __func__, &flush); if (result != VDO_SUCCESS) vdo_log_error_strerror(result, "failed to allocate spare flush"); @@ -134,7 +134,7 @@ static void free_flush(void *element, void *pool_data __always_unused) */ int vdo_make_flusher(struct vdo *vdo) { - int result = vdo_allocate(1, struct flusher, __func__, &vdo->flusher); + int result = vdo_allocate(1, __func__, &vdo->flusher); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/funnel-queue.c b/drivers/md/dm-vdo/funnel-queue.c index a63b2f2bfd7d..7011963c9073 100644 --- a/drivers/md/dm-vdo/funnel-queue.c +++ b/drivers/md/dm-vdo/funnel-queue.c @@ -14,7 +14,7 @@ int vdo_make_funnel_queue(struct funnel_queue **queue_ptr) int result; struct funnel_queue *queue; - result = vdo_allocate(1, struct funnel_queue, "funnel queue", &queue); + result = vdo_allocate(1, "funnel queue", &queue); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index 8a79b33b8b09..62d300f70de9 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -322,7 +322,7 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na "queue priority count %u within limit %u", type->max_priority, VDO_WORK_Q_MAX_PRIORITY); - result = vdo_allocate(1, struct simple_work_queue, "simple work queue", &queue); + result = vdo_allocate(1, "simple work queue", &queue); if 
(result != VDO_SUCCESS) return result; @@ -405,13 +405,11 @@ int vdo_make_work_queue(const char *thread_name_prefix, const char *name, return result; } - result = vdo_allocate(1, struct round_robin_work_queue, "round-robin work queue", - &queue); + result = vdo_allocate(1, "round-robin work queue", &queue); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(thread_count, struct simple_work_queue *, - "subordinate work queues", &queue->service_queues); + result = vdo_allocate(thread_count, "subordinate work queues", &queue->service_queues); if (result != VDO_SUCCESS) { vdo_free(queue); return result; diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c index fb1db41c794b..bb3b0ab5d50d 100644 --- a/drivers/md/dm-vdo/indexer/chapter-index.c +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -20,7 +20,7 @@ int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, size_t memory_size; struct open_chapter_index *index; - result = vdo_allocate(1, struct open_chapter_index, "open chapter index", &index); + result = vdo_allocate(1, "open chapter index", &index); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c index 5532371b952f..4a2cc66cfd60 100644 --- a/drivers/md/dm-vdo/indexer/config.c +++ b/drivers/md/dm-vdo/indexer/config.c @@ -325,7 +325,7 @@ int uds_make_configuration(const struct uds_parameters *params, if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct uds_configuration, __func__, &config); + result = vdo_allocate(1, __func__, &config); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c index 0ac2443f0df3..b288749067de 100644 --- a/drivers/md/dm-vdo/indexer/delta-index.c +++ b/drivers/md/dm-vdo/indexer/delta-index.c @@ -311,18 +311,16 @@ static int initialize_delta_zone(struct delta_zone *delta_zone, 
size_t size, { int result; - result = vdo_allocate(size, u8, "delta list", &delta_zone->memory); + result = vdo_allocate(size, "delta list", &delta_zone->memory); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(list_count + 2, u64, "delta list temp", - &delta_zone->new_offsets); + result = vdo_allocate(list_count + 2, "delta list temp", &delta_zone->new_offsets); if (result != VDO_SUCCESS) return result; /* Allocate the delta lists. */ - result = vdo_allocate(list_count + 2, struct delta_list, "delta lists", - &delta_zone->delta_lists); + result = vdo_allocate(list_count + 2, "delta lists", &delta_zone->delta_lists); if (result != VDO_SUCCESS) return result; @@ -352,8 +350,7 @@ int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zon unsigned int z; size_t zone_memory; - result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones", - &delta_index->delta_zones); + result = vdo_allocate(zone_count, "Delta Index Zones", &delta_index->delta_zones); if (result != VDO_SUCCESS) return result; @@ -1047,7 +1044,7 @@ int uds_finish_restoring_delta_index(struct delta_index *delta_index, unsigned int z; u8 *data; - result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data); + result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, __func__, &data); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c index 1a5735375ddc..03797cf87b91 100644 --- a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c @@ -198,7 +198,7 @@ int uds_make_request_queue(const char *queue_name, int result; struct uds_request_queue *queue; - result = vdo_allocate(1, struct uds_request_queue, __func__, &queue); + result = vdo_allocate(1, __func__, &queue); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c index 
c0575612e820..49f122a223d5 100644 --- a/drivers/md/dm-vdo/indexer/geometry.c +++ b/drivers/md/dm-vdo/indexer/geometry.c @@ -61,7 +61,7 @@ int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, int result; struct index_geometry *geometry; - result = vdo_allocate(1, struct index_geometry, "geometry", &geometry); + result = vdo_allocate(1, "geometry", &geometry); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index 61edf2b72427..5f4ce4ab1b1e 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -249,6 +249,32 @@ static int __must_check compute_sizes(const struct uds_configuration *config, return UDS_SUCCESS; } +int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size) +{ + int result; + struct uds_configuration *index_config; + struct save_layout_sizes sizes; + + if (index_size == NULL) { + vdo_log_error("Missing output size pointer"); + return -EINVAL; + } + + result = uds_make_configuration(parameters, &index_config); + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, "cannot compute index size"); + return result; + } + + result = compute_sizes(index_config, &sizes); + uds_free_configuration(index_config); + if (result != UDS_SUCCESS) + return result; + + *index_size = sizes.total_size; + return UDS_SUCCESS; +} + /* Create unique data using the current time and a pseudorandom number. 
*/ static void create_unique_nonce_data(u8 *buffer) { @@ -459,8 +485,7 @@ static int __must_check make_index_save_region_table(struct index_save_layout *i type = RH_TYPE_UNSAVED; } - result = vdo_allocate_extended(struct region_table, region_count, - struct layout_region, + result = vdo_allocate_extended(region_count, regions, "layout region table for ISL", &table); if (result != VDO_SUCCESS) return result; @@ -520,7 +545,7 @@ static int __must_check write_index_save_header(struct index_save_layout *isl, u8 *buffer; size_t offset = 0; - result = vdo_allocate(table->encoded_size, u8, "index save data", &buffer); + result = vdo_allocate(table->encoded_size, "index save data", &buffer); if (result != VDO_SUCCESS) return result; @@ -642,9 +667,8 @@ static int __must_check make_layout_region_table(struct index_layout *layout, struct region_table *table; struct layout_region *lr; - result = vdo_allocate_extended(struct region_table, region_count, - struct layout_region, "layout region table", - &table); + result = vdo_allocate_extended(region_count, regions, + "layout region table", &table); if (result != VDO_SUCCESS) return result; @@ -690,7 +714,7 @@ static int __must_check write_layout_header(struct index_layout *layout, u8 *buffer; size_t offset = 0; - result = vdo_allocate(table->encoded_size, u8, "layout data", &buffer); + result = vdo_allocate(table->encoded_size, "layout data", &buffer); if (result != VDO_SUCCESS) return result; @@ -780,8 +804,7 @@ static int create_index_layout(struct index_layout *layout, struct uds_configura if (result != UDS_SUCCESS) return result; - result = vdo_allocate(sizes.save_count, struct index_save_layout, __func__, - &layout->index.saves); + result = vdo_allocate(sizes.save_count, __func__, &layout->index.saves); if (result != VDO_SUCCESS) return result; @@ -1138,8 +1161,7 @@ static int __must_check load_region_table(struct buffered_reader *reader, header.version); } - result = vdo_allocate_extended(struct region_table, 
header.region_count, - struct layout_region, + result = vdo_allocate_extended(header.region_count, regions, "single file layout region table", &table); if (result != VDO_SUCCESS) return result; @@ -1177,7 +1199,7 @@ static int __must_check read_super_block_data(struct buffered_reader *reader, u8 *buffer; size_t offset = 0; - result = vdo_allocate(saved_size, u8, "super block data", &buffer); + result = vdo_allocate(saved_size, "super block data", &buffer); if (result != VDO_SUCCESS) return result; @@ -1311,8 +1333,7 @@ static int __must_check reconstitute_layout(struct index_layout *layout, int result; u64 next_block = first_block; - result = vdo_allocate(layout->super.max_saves, struct index_save_layout, - __func__, &layout->index.saves); + result = vdo_allocate(layout->super.max_saves, __func__, &layout->index.saves); if (result != VDO_SUCCESS) return result; @@ -1445,6 +1466,9 @@ static int __must_check reconstruct_index_save(struct index_save_layout *isl, u64 last_block = next_block + isl->index_save.block_count; isl->zone_count = table->header.region_count - 3; + if (isl->zone_count > MAX_ZONES) + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "invalid zone count"); last_region = &table->regions[table->header.region_count - 1]; if (last_region->kind == RL_KIND_EMPTY) { @@ -1672,7 +1696,7 @@ int uds_make_index_layout(struct uds_configuration *config, bool new_layout, if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct index_layout, __func__, &layout); + result = vdo_allocate(1, __func__, &layout); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c index 00b44e07d0c1..1d45d466d07f 100644 --- a/drivers/md/dm-vdo/indexer/index-page-map.c +++ b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -38,13 +38,13 @@ int uds_make_index_page_map(const struct index_geometry *geometry, int result; struct index_page_map *map; - result = vdo_allocate(1, struct 
index_page_map, "page map", &map); + result = vdo_allocate(1, "page map", &map); if (result != VDO_SUCCESS) return result; map->geometry = geometry; map->entries_per_chapter = geometry->index_pages_per_chapter - 1; - result = vdo_allocate(get_entry_count(geometry), u16, "Index Page Map Entries", + result = vdo_allocate(get_entry_count(geometry), "Index Page Map Entries", &map->entries); if (result != VDO_SUCCESS) { uds_free_index_page_map(map); @@ -118,7 +118,7 @@ int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); u32 i; - result = vdo_allocate(saved_size, u8, "page map data", &buffer); + result = vdo_allocate(saved_size, "page map data", &buffer); if (result != VDO_SUCCESS) return result; @@ -145,7 +145,7 @@ int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader * u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); u32 i; - result = vdo_allocate(saved_size, u8, "page map data", &buffer); + result = vdo_allocate(saved_size, "page map data", &buffer); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index aa575a24e0b2..6c78070e1a05 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -217,7 +217,7 @@ static int __must_check make_empty_index_session(struct uds_index_session **inde int result; struct uds_index_session *session; - result = vdo_allocate(1, struct uds_index_session, __func__, &session); + result = vdo_allocate(1, __func__, &session); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index df4934846244..793bd32c1179 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -88,7 +88,7 @@ static int launch_zone_message(struct uds_zone_message message, unsigned int zon 
int result; struct uds_request *request; - result = vdo_allocate(1, struct uds_request, __func__, &request); + result = vdo_allocate(1, __func__, &request); if (result != VDO_SUCCESS) return result; @@ -764,9 +764,7 @@ static int make_chapter_writer(struct uds_index *index, size_t collated_records_size = (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter); - result = vdo_allocate_extended(struct chapter_writer, index->zone_count, - struct open_chapter_zone *, "Chapter Writer", - &writer); + result = vdo_allocate_extended(index->zone_count, chapters, "Chapter Writer", &writer); if (result != VDO_SUCCESS) return result; @@ -1123,7 +1121,7 @@ static int make_index_zone(struct uds_index *index, unsigned int zone_number) int result; struct index_zone *zone; - result = vdo_allocate(1, struct index_zone, "index zone", &zone); + result = vdo_allocate(1, "index zone", &zone); if (result != VDO_SUCCESS) return result; @@ -1160,8 +1158,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op u64 nonce; unsigned int z; - result = vdo_allocate_extended(struct uds_index, config->zone_count, - struct uds_request_queue *, "index", &index); + result = vdo_allocate_extended(config->zone_count, zone_queues, "index", &index); if (result != VDO_SUCCESS) return result; @@ -1173,8 +1170,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op return result; } - result = vdo_allocate(index->zone_count, struct index_zone *, "zones", - &index->zones); + result = vdo_allocate(index->zone_count, "zones", &index->zones); if (result != VDO_SUCCESS) { uds_free_index(index); return result; diff --git a/drivers/md/dm-vdo/indexer/index.h b/drivers/md/dm-vdo/indexer/index.h index edabb239548e..1891f2de508e 100644 --- a/drivers/md/dm-vdo/indexer/index.h +++ b/drivers/md/dm-vdo/indexer/index.h @@ -53,7 +53,7 @@ struct uds_index { index_callback_fn callback; struct uds_request_queue *triage_queue; - struct 
uds_request_queue *zone_queues[]; + struct uds_request_queue *zone_queues[] __counted_by(zone_count); }; enum request_stage { diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h index 7c1fc4577f5b..d765f24328eb 100644 --- a/drivers/md/dm-vdo/indexer/indexer.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -282,6 +282,10 @@ struct uds_request { ); }; +/* Compute the number of bytes needed to store an index. */ +int __must_check uds_compute_index_size(const struct uds_parameters *parameters, + u64 *index_size); + /* A session is required for most index operations. */ int __must_check uds_create_index_session(struct uds_index_session **session); diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c index 1bee9d63dc0a..f42861372030 100644 --- a/drivers/md/dm-vdo/indexer/io-factory.c +++ b/drivers/md/dm-vdo/indexer/io-factory.c @@ -64,7 +64,7 @@ int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_p int result; struct io_factory *factory; - result = vdo_allocate(1, struct io_factory, __func__, &factory); + result = vdo_allocate(1, __func__, &factory); if (result != VDO_SUCCESS) return result; @@ -144,7 +144,7 @@ int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct buffered_reader, "buffered reader", &reader); + result = vdo_allocate(1, "buffered reader", &reader); if (result != VDO_SUCCESS) { dm_bufio_client_destroy(client); return result; @@ -282,7 +282,7 @@ int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct buffered_writer, "buffered writer", &writer); + result = vdo_allocate(1, "buffered writer", &writer); if (result != VDO_SUCCESS) { dm_bufio_client_destroy(client); return result; diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c 
b/drivers/md/dm-vdo/indexer/open-chapter.c index 4a67bcadaae0..89b91c600bfd 100644 --- a/drivers/md/dm-vdo/indexer/open-chapter.c +++ b/drivers/md/dm-vdo/indexer/open-chapter.c @@ -68,9 +68,7 @@ int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zo size_t capacity = geometry->records_per_chapter / zone_count; size_t slot_count = (1 << bits_per(capacity * LOAD_RATIO)); - result = vdo_allocate_extended(struct open_chapter_zone, slot_count, - struct open_chapter_zone_slot, "open chapter", - &open_chapter); + result = vdo_allocate_extended(slot_count, slots, "open chapter", &open_chapter); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/open-chapter.h b/drivers/md/dm-vdo/indexer/open-chapter.h index a4250bb19525..ea6d7336aea0 100644 --- a/drivers/md/dm-vdo/indexer/open-chapter.h +++ b/drivers/md/dm-vdo/indexer/open-chapter.h @@ -40,7 +40,7 @@ struct open_chapter_zone { /* The number of slots in the hash table */ unsigned int slot_count; /* The hash table slots, referencing virtual record numbers */ - struct open_chapter_zone_slot slots[]; + struct open_chapter_zone_slot slots[] __counted_by(slot_count); }; int __must_check uds_make_open_chapter(const struct index_geometry *geometry, diff --git a/drivers/md/dm-vdo/indexer/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c index 66b8c706a1ef..4b81e130d18a 100644 --- a/drivers/md/dm-vdo/indexer/radix-sort.c +++ b/drivers/md/dm-vdo/indexer/radix-sort.c @@ -211,8 +211,7 @@ int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter) unsigned int stack_size = count / INSERTION_SORT_THRESHOLD; struct radix_sorter *radix_sorter; - result = vdo_allocate_extended(struct radix_sorter, stack_size, struct task, - __func__, &radix_sorter); + result = vdo_allocate_extended(stack_size, stack, __func__, &radix_sorter); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c 
index 28920167827c..eb62d3f01834 100644 --- a/drivers/md/dm-vdo/indexer/sparse-cache.c +++ b/drivers/md/dm-vdo/indexer/sparse-cache.c @@ -222,13 +222,12 @@ static int __must_check initialize_cached_chapter_index(struct cached_chapter_in chapter->virtual_chapter = NO_CHAPTER; chapter->index_pages_count = geometry->index_pages_per_chapter; - result = vdo_allocate(chapter->index_pages_count, struct delta_index_page, - __func__, &chapter->index_pages); + result = vdo_allocate(chapter->index_pages_count, __func__, &chapter->index_pages); if (result != VDO_SUCCESS) return result; - return vdo_allocate(chapter->index_pages_count, struct dm_buffer *, - "sparse index volume pages", &chapter->page_buffers); + return vdo_allocate(chapter->index_pages_count, "sparse index volume pages", + &chapter->page_buffers); } static int __must_check make_search_list(struct sparse_cache *cache, @@ -294,8 +293,7 @@ int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int ca } /* purge_search_list() needs some temporary lists for sorting. */ - result = vdo_allocate(capacity * 2, struct cached_chapter_index *, - "scratch entries", &cache->scratch_entries); + result = vdo_allocate(capacity * 2, "scratch entries", &cache->scratch_entries); if (result != VDO_SUCCESS) goto out; diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index afb062e1f1fb..e78d2725ce8b 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -1211,13 +1211,12 @@ static int initialize_volume_sub_index(const struct uds_configuration *config, (zone_count * sizeof(struct volume_sub_index_zone))); /* The following arrays are initialized to all zeros. 
*/ - result = vdo_allocate(params.list_count, u64, "first chapter to flush", + result = vdo_allocate(params.list_count, "first chapter to flush", &sub_index->flush_chapters); if (result != VDO_SUCCESS) return result; - return vdo_allocate(zone_count, struct volume_sub_index_zone, - "volume index zones", &sub_index->zones); + return vdo_allocate(zone_count, "volume index zones", &sub_index->zones); } int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce, @@ -1228,7 +1227,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non struct volume_index *volume_index; int result; - result = vdo_allocate(1, struct volume_index, "volume index", &volume_index); + result = vdo_allocate(1, "volume index", &volume_index); if (result != VDO_SUCCESS) return result; @@ -1249,8 +1248,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non volume_index->sparse_sample_rate = config->sparse_sample_rate; - result = vdo_allocate(config->zone_count, struct volume_index_zone, - "volume index zones", &volume_index->zones); + result = vdo_allocate(config->zone_count, "volume index zones", &volume_index->zones); if (result != VDO_SUCCESS) { uds_free_volume_index(volume_index); return result; diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index 425b3a74f4db..af97c0cbeede 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -1509,23 +1509,21 @@ static int __must_check initialize_page_cache(struct page_cache *cache, if (result != VDO_SUCCESS) return result; - result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, - "volume read queue", &cache->read_queue); + result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, "volume read queue", + &cache->read_queue); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(cache->zone_count, struct search_pending_counter, - "Volume Cache Zones", 
&cache->search_pending_counters); + result = vdo_allocate(cache->zone_count, "Volume Cache Zones", + &cache->search_pending_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(cache->indexable_pages, u16, "page cache index", - &cache->index); + result = vdo_allocate(cache->indexable_pages, "page cache index", &cache->index); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(cache->cache_slots, struct cached_page, "page cache cache", - &cache->cache); + result = vdo_allocate(cache->cache_slots, "page cache cache", &cache->cache); if (result != VDO_SUCCESS) return result; @@ -1548,7 +1546,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout unsigned int reserved_buffers; int result; - result = vdo_allocate(1, struct volume, "volume", &volume); + result = vdo_allocate(1, "volume", &volume); if (result != VDO_SUCCESS) return result; @@ -1585,8 +1583,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout return result; } - result = vdo_allocate(geometry->records_per_page, - const struct uds_volume_record *, "record pointers", + result = vdo_allocate(geometry->records_per_page, "record pointers", &volume->record_pointers); if (result != VDO_SUCCESS) { uds_free_volume(volume); @@ -1626,8 +1623,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout uds_init_cond(&volume->read_threads_read_done_cond); uds_init_cond(&volume->read_threads_cond); - result = vdo_allocate(config->read_threads, struct thread *, "reader threads", - &volume->reader_threads); + result = vdo_allocate(config->read_threads, "reader threads", &volume->reader_threads); if (result != VDO_SUCCESS) { uds_free_volume(volume); return result; diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c index aeb690415dbd..28d8af1f9be2 100644 --- a/drivers/md/dm-vdo/int-map.c +++ b/drivers/md/dm-vdo/int-map.c @@ -164,8 +164,7 @@ static int allocate_buckets(struct int_map 
*map, size_t capacity) * without have to wrap back around to element zero. */ map->bucket_count = capacity + (NEIGHBORHOOD - 1); - return vdo_allocate(map->bucket_count, struct bucket, - "struct int_map buckets", &map->buckets); + return vdo_allocate(map->bucket_count, "struct int_map buckets", &map->buckets); } /** @@ -182,7 +181,7 @@ int vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr) int result; size_t capacity; - result = vdo_allocate(1, struct int_map, "struct int_map", &map); + result = vdo_allocate(1, "struct int_map", &map); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index e26d75f8366d..0916c8609543 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -365,6 +365,33 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, } /** + * vdo_submit_metadata_vio_wait() - Submit I/O for a metadata vio and wait for completion. + * @vio: the vio for which to issue I/O + * @physical: the physical block number to read or write + * @operation: the type of I/O to perform + * + * The function operates similarly to __submit_metadata_vio except that it will + * block until the work is done. It can be used to do i/o before work queues + * and thread completions are set up. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_submit_metadata_vio_wait(struct vio *vio, + physical_block_number_t physical, + blk_opf_t operation) +{ + int result; + + result = vio_reset_bio(vio, vio->data, NULL, operation | REQ_META, physical); + if (result != VDO_SUCCESS) + return result; + + bio_set_dev(vio->bio, vdo_get_backing_device(vio->completion.vdo)); + submit_bio_wait(vio->bio); + return blk_status_to_errno(vio->bio->bi_status); +} + +/** * vdo_make_io_submitter() - Create an io_submitter structure. * @thread_count: Number of bio-submission threads to set up. 
* @rotation_interval: Interval to use when rotating between bio-submission threads when enqueuing @@ -383,8 +410,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter struct io_submitter *io_submitter; int result; - result = vdo_allocate_extended(struct io_submitter, thread_count, - struct bio_queue_data, "bio submission data", + result = vdo_allocate_extended(thread_count, bio_queue_data, "bio submission data", &io_submitter); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h index 3088f11055fd..0f320a60e9e8 100644 --- a/drivers/md/dm-vdo/io-submitter.h +++ b/drivers/md/dm-vdo/io-submitter.h @@ -56,4 +56,8 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); } +int vdo_submit_metadata_vio_wait(struct vio *vio, + physical_block_number_t physical, + blk_opf_t operation); + #endif /* VDO_IO_SUBMITTER_H */ diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c index 0a27e60a9dfd..fa7c3eb7ee6b 100644 --- a/drivers/md/dm-vdo/logical-zone.c +++ b/drivers/md/dm-vdo/logical-zone.c @@ -94,8 +94,7 @@ int vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = vdo_allocate_extended(struct logical_zones, zone_count, - struct logical_zone, __func__, &zones); + result = vdo_allocate_extended(zone_count, zones, __func__, &zones); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/logical-zone.h b/drivers/md/dm-vdo/logical-zone.h index 1b666c84a193..a36a864c6836 100644 --- a/drivers/md/dm-vdo/logical-zone.h +++ b/drivers/md/dm-vdo/logical-zone.h @@ -60,7 +60,7 @@ struct logical_zones { /* The number of zones */ zone_count_t zone_count; /* The logical zones themselves */ - struct logical_zone zones[]; + struct logical_zone zones[] __counted_by(zone_count); }; int __must_check 
vdo_make_logical_zones(struct vdo *vdo, diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c index 185f259c7245..a7f07522110d 100644 --- a/drivers/md/dm-vdo/memory-alloc.c +++ b/drivers/md/dm-vdo/memory-alloc.c @@ -245,7 +245,7 @@ int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr) } else { struct vmalloc_block_info *block; - if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == VDO_SUCCESS) { + if (vdo_allocate(1, __func__, &block) == VDO_SUCCESS) { /* * It is possible for __vmalloc to fail to allocate memory because there * are no pages available. A short sleep may allow the page reclaimer @@ -341,6 +341,7 @@ int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *w void *new_ptr) { int result; + char *temp_ptr; if (size == 0) { vdo_free(ptr); @@ -348,9 +349,10 @@ int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *w return VDO_SUCCESS; } - result = vdo_allocate(size, char, what, new_ptr); + result = vdo_allocate(size, what, &temp_ptr); if (result != VDO_SUCCESS) return result; + *(void **) new_ptr = temp_ptr; if (ptr != NULL) { if (old_size < size) @@ -368,7 +370,7 @@ int vdo_duplicate_string(const char *string, const char *what, char **new_string int result; u8 *dup; - result = vdo_allocate(strlen(string) + 1, u8, what, &dup); + result = vdo_allocate(strlen(string) + 1, what, &dup); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/memory-alloc.h b/drivers/md/dm-vdo/memory-alloc.h index 0093d9f940d9..bc5527327ed8 100644 --- a/drivers/md/dm-vdo/memory-alloc.h +++ b/drivers/md/dm-vdo/memory-alloc.h @@ -8,6 +8,7 @@ #include <linux/cache.h> #include <linux/io.h> /* for PAGE_SIZE */ +#include <linux/overflow.h> #include "permassert.h" #include "thread-registry.h" @@ -16,86 +17,35 @@ int __must_check vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr); /* - * Allocate storage based on element counts, 
sizes, and alignment. - * - * This is a generalized form of our allocation use case: It allocates an array of objects, - * optionally preceded by one object of another type (i.e., a struct with trailing variable-length - * array), with the alignment indicated. - * - * Why is this inline? The sizes and alignment will always be constant, when invoked through the - * macros below, and often the count will be a compile-time constant 1 or the number of extra bytes - * will be a compile-time constant 0. So at least some of the arithmetic can usually be optimized - * away, and the run-time selection between allocation functions always can. In many cases, it'll - * boil down to just a function call with a constant size. - * - * @count: The number of objects to allocate - * @size: The size of an object - * @extra: The number of additional bytes to allocate - * @align: The required alignment - * @what: What is being allocated (for error logging) - * @ptr: A pointer to hold the allocated memory - * - * Return: VDO_SUCCESS or an error code - */ -static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra, - size_t align, const char *what, void *ptr) -{ - size_t total_size = count * size + extra; - - /* Overflow check: */ - if ((size > 0) && (count > ((SIZE_MAX - extra) / size))) { - /* - * This is kind of a hack: We rely on the fact that SIZE_MAX would cover the entire - * address space (minus one byte) and thus the system can never allocate that much - * and the call will always fail. So we can report an overflow as "out of memory" - * by asking for "merely" SIZE_MAX bytes. - */ - total_size = SIZE_MAX; - } - - return vdo_allocate_memory(total_size, align, what, ptr); -} - -/* * Allocate one or more elements of the indicated type, logging an error if the allocation fails. * The memory will be zeroed. * * @COUNT: The number of objects to allocate - * @TYPE: The type of objects to allocate. This type determines the alignment of the allocation. 
* @WHAT: What is being allocated (for error logging) * @PTR: A pointer to hold the allocated memory * * Return: VDO_SUCCESS or an error code */ -#define vdo_allocate(COUNT, TYPE, WHAT, PTR) \ - __vdo_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) +#define vdo_allocate(COUNT, WHAT, PTR) \ + vdo_allocate_memory(size_mul((COUNT), sizeof(typeof(**(PTR)))), \ + __alignof__(typeof(**(PTR))), WHAT, PTR) /* - * Allocate one object of an indicated type, followed by one or more elements of a second type, - * logging an error if the allocation fails. The memory will be zeroed. + * Allocate a structure with a flexible array member, with a specified number of elements, logging + * an error if the allocation fails. The memory will be zeroed. * - * @TYPE1: The type of the primary object to allocate. This type determines the alignment of the - * allocated memory. * @COUNT: The number of objects to allocate - * @TYPE2: The type of array objects to allocate + * @FIELD: The flexible array field at the end of the structure * @WHAT: What is being allocated (for error logging) * @PTR: A pointer to hold the allocated memory * * Return: VDO_SUCCESS or an error code */ -#define vdo_allocate_extended(TYPE1, COUNT, TYPE2, WHAT, PTR) \ - __extension__({ \ - int _result; \ - TYPE1 **_ptr = (PTR); \ - BUILD_BUG_ON(__alignof__(TYPE1) < __alignof__(TYPE2)); \ - _result = __vdo_do_allocation(COUNT, \ - sizeof(TYPE2), \ - sizeof(TYPE1), \ - __alignof__(TYPE1), \ - WHAT, \ - _ptr); \ - _result; \ - }) +#define vdo_allocate_extended(COUNT, FIELD, WHAT, PTR) \ + vdo_allocate_memory(struct_size(*(PTR), FIELD, (COUNT)), \ + __alignof__(typeof(**(PTR))), \ + WHAT, \ + (PTR)) /* * Allocate memory starting on a cache line boundary, logging an error if the allocation fails. 
The diff --git a/drivers/md/dm-vdo/message-stats.c b/drivers/md/dm-vdo/message-stats.c index 75dfcd7c5f63..b4c919780c22 100644 --- a/drivers/md/dm-vdo/message-stats.c +++ b/drivers/md/dm-vdo/message-stats.c @@ -420,7 +420,7 @@ int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen) struct vdo_statistics *stats; int result; - result = vdo_allocate(1, struct vdo_statistics, __func__, &stats); + result = vdo_allocate(1, __func__, &stats); if (result != VDO_SUCCESS) { vdo_log_error("Cannot allocate memory to write VDO statistics"); return result; diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c index 666be6d557e1..ea2d8d14495c 100644 --- a/drivers/md/dm-vdo/packer.c +++ b/drivers/md/dm-vdo/packer.c @@ -120,8 +120,7 @@ static int __must_check make_bin(struct packer *packer) struct packer_bin *bin; int result; - result = vdo_allocate_extended(struct packer_bin, VDO_MAX_COMPRESSION_SLOTS, - struct vio *, __func__, &bin); + result = vdo_allocate_extended(VDO_MAX_COMPRESSION_SLOTS, incoming, __func__, &bin); if (result != VDO_SUCCESS) return result; @@ -146,7 +145,7 @@ int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **pa block_count_t i; int result; - result = vdo_allocate(1, struct packer, __func__, &packer); + result = vdo_allocate(1, __func__, &packer); if (result != VDO_SUCCESS) return result; @@ -168,8 +167,8 @@ int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **pa * bin must have a canceler for which it is waiting, and any canceler will only have * canceled one lock holder at a time. 
*/ - result = vdo_allocate_extended(struct packer_bin, MAXIMUM_VDO_USER_VIOS / 2, - struct vio *, __func__, &packer->canceled_bin); + result = vdo_allocate_extended(MAXIMUM_VDO_USER_VIOS / 2, incoming, __func__, + &packer->canceled_bin); if (result != VDO_SUCCESS) { vdo_free_packer(packer); return result; diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c index 686eb7d714e6..d6ad8f1a33bb 100644 --- a/drivers/md/dm-vdo/physical-zone.c +++ b/drivers/md/dm-vdo/physical-zone.c @@ -200,7 +200,7 @@ struct pbn_lock_pool { /** @idle_list: A list containing all idle PBN lock instances. */ struct list_head idle_list; /** @locks: The memory for all the locks allocated by this pool. */ - idle_pbn_lock locks[]; + idle_pbn_lock locks[] __counted_by(capacity); }; /** @@ -240,8 +240,7 @@ static int make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr) struct pbn_lock_pool *pool; int result; - result = vdo_allocate_extended(struct pbn_lock_pool, capacity, idle_pbn_lock, - __func__, &pool); + result = vdo_allocate_extended(capacity, locks, __func__, &pool); if (result != VDO_SUCCESS) return result; @@ -368,8 +367,7 @@ int vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = vdo_allocate_extended(struct physical_zones, zone_count, - struct physical_zone, __func__, &zones); + result = vdo_allocate_extended(zone_count, zones, __func__, &zones); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c index 9bae8256ba4e..bb8a878ce4e5 100644 --- a/drivers/md/dm-vdo/priority-table.c +++ b/drivers/md/dm-vdo/priority-table.c @@ -60,8 +60,7 @@ int vdo_make_priority_table(unsigned int max_priority, struct priority_table **t if (max_priority > MAX_PRIORITY) return UDS_INVALID_ARGUMENT; - result = vdo_allocate_extended(struct priority_table, max_priority + 1, - struct bucket, __func__, &table); + result = 
vdo_allocate_extended(max_priority + 1, buckets, __func__, &table); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c index 9cc0f0ff1664..f03939cc89e3 100644 --- a/drivers/md/dm-vdo/recovery-journal.c +++ b/drivers/md/dm-vdo/recovery-journal.c @@ -593,32 +593,29 @@ static int __must_check initialize_lock_counter(struct recovery_journal *journal struct thread_config *config = &vdo->thread_config; struct lock_counter *counter = &journal->lock_counter; - result = vdo_allocate(journal->size, u16, __func__, &counter->journal_counters); + result = vdo_allocate(journal->size, __func__, &counter->journal_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size, atomic_t, __func__, - &counter->journal_decrement_counts); + result = vdo_allocate(journal->size, __func__, &counter->journal_decrement_counts); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size * config->logical_zone_count, u16, __func__, + result = vdo_allocate(journal->size * config->logical_zone_count, __func__, &counter->logical_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size, atomic_t, __func__, - &counter->logical_zone_counts); + result = vdo_allocate(journal->size, __func__, &counter->logical_zone_counts); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size * config->physical_zone_count, u16, __func__, + result = vdo_allocate(journal->size * config->physical_zone_count, __func__, &counter->physical_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size, atomic_t, __func__, - &counter->physical_zone_counts); + result = vdo_allocate(journal->size, __func__, &counter->physical_zone_counts); if (result != VDO_SUCCESS) return result; @@ -672,7 +669,7 @@ static int initialize_recovery_block(struct vdo *vdo, struct recovery_journal *j * Allocate a full block for the 
journal block even though not all of the space is used * since the VIO needs to write a full disk block. */ - result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &data); + result = vdo_allocate(VDO_BLOCK_SIZE, __func__, &data); if (result != VDO_SUCCESS) return result; @@ -711,10 +708,8 @@ int vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, nonce_t struct recovery_journal *journal; int result; - result = vdo_allocate_extended(struct recovery_journal, - RECOVERY_JOURNAL_RESERVED_BLOCKS, - struct recovery_journal_block, __func__, - &journal); + result = vdo_allocate_extended(RECOVERY_JOURNAL_RESERVED_BLOCKS, blocks, + __func__, &journal); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index 8c006fb3afcf..bfed62260280 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -127,7 +127,7 @@ struct repair_completion { * The page completions used for playing the journal into the block map, and, during * read-only rebuild, for rebuilding the reference counts from the block map. */ - struct vdo_page_completion page_completions[]; + struct vdo_page_completion page_completions[] __counted_by(page_count); }; /* @@ -1417,8 +1417,7 @@ static int parse_journal_for_rebuild(struct repair_completion *repair) * packed_recovery_journal_entry from every valid journal block. */ count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block); - result = vdo_allocate(count, struct numbered_block_mapping, __func__, - &repair->entries); + result = vdo_allocate(count, __func__, &repair->entries); if (result != VDO_SUCCESS) return result; @@ -1464,8 +1463,7 @@ static int extract_new_mappings(struct repair_completion *repair) * Allocate an array of numbered_block_mapping structs just large enough to transcribe * every packed_recovery_journal_entry from every valid journal block. 
*/ - result = vdo_allocate(repair->entry_count, struct numbered_block_mapping, - __func__, &repair->entries); + result = vdo_allocate(repair->entry_count, __func__, &repair->entries); if (result != VDO_SUCCESS) return result; @@ -1715,9 +1713,7 @@ void vdo_repair(struct vdo_completion *parent) vdo_log_warning("Device was dirty, rebuilding reference counts"); } - result = vdo_allocate_extended(struct repair_completion, page_count, - struct vdo_page_completion, __func__, - &repair); + result = vdo_allocate_extended(page_count, page_completions, __func__, &repair); if (result != VDO_SUCCESS) { vdo_fail_completion(parent, result); return; @@ -1729,12 +1725,11 @@ void vdo_repair(struct vdo_completion *parent) prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN); repair->page_count = page_count; - result = vdo_allocate(remaining * VDO_BLOCK_SIZE, char, __func__, - &repair->journal_data); + result = vdo_allocate(remaining * VDO_BLOCK_SIZE, __func__, &repair->journal_data); if (abort_on_error(result, repair)) return; - result = vdo_allocate(vio_count, struct vio, __func__, &repair->vios); + result = vdo_allocate(vio_count, __func__, &repair->vios); if (abort_on_error(result, repair)) return; diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 034ecaa51f48..7fcbb361b38d 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -2453,8 +2453,7 @@ static int allocate_slab_counters(struct vdo_slab *slab) if (result != VDO_SUCCESS) return result; - result = vdo_allocate(slab->reference_block_count, struct reference_block, - __func__, &slab->reference_blocks); + result = vdo_allocate(slab->reference_block_count, __func__, &slab->reference_blocks); if (result != VDO_SUCCESS) return result; @@ -2463,8 +2462,7 @@ static int allocate_slab_counters(struct vdo_slab *slab) * so we can word-search even at the very end. 
*/ bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD); - result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array", - &slab->counters); + result = vdo_allocate(bytes, "ref counts array", &slab->counters); if (result != VDO_SUCCESS) { vdo_free(vdo_forget(slab->reference_blocks)); return result; @@ -3563,8 +3561,7 @@ static int get_slab_statuses(struct block_allocator *allocator, struct slab_status *statuses; struct slab_iterator iterator = get_slab_iterator(allocator); - result = vdo_allocate(allocator->slab_count, struct slab_status, __func__, - &statuses); + result = vdo_allocate(allocator->slab_count, __func__, &statuses); if (result != VDO_SUCCESS) return result; @@ -3739,13 +3736,12 @@ static int initialize_slab_journal(struct vdo_slab *slab) const struct slab_config *slab_config = &slab->allocator->depot->slab_config; int result; - result = vdo_allocate(slab_config->slab_journal_blocks, struct journal_lock, - __func__, &journal->locks); + result = vdo_allocate(slab_config->slab_journal_blocks, __func__, &journal->locks); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block", - (char **) &journal->block); + BUILD_BUG_ON(sizeof(*journal->block) != VDO_BLOCK_SIZE); + result = vdo_allocate(1, "struct packed_slab_journal_block", &journal->block); if (result != VDO_SUCCESS) return result; @@ -3800,7 +3796,7 @@ static int __must_check make_slab(physical_block_number_t slab_origin, struct vdo_slab *slab; int result; - result = vdo_allocate(1, struct vdo_slab, __func__, &slab); + result = vdo_allocate(1, __func__, &slab); if (result != VDO_SUCCESS) return result; @@ -3857,8 +3853,7 @@ static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count) physical_block_number_t slab_origin; int result; - result = vdo_allocate(slab_count, struct vdo_slab *, - "slab pointer array", &depot->new_slabs); + result = vdo_allocate(slab_count, "slab pointer array", 
&depot->new_slabs); if (result != VDO_SUCCESS) return result; @@ -4011,8 +4006,7 @@ static int initialize_slab_scrubber(struct block_allocator *allocator) char *journal_data; int result; - result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size, - char, __func__, &journal_data); + result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size, __func__, &journal_data); if (result != VDO_SUCCESS) return result; @@ -4045,7 +4039,7 @@ static int __must_check initialize_slab_summary_block(struct block_allocator *al struct slab_summary_block *block = &allocator->summary_blocks[index]; int result; - result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries); + result = vdo_allocate(VDO_BLOCK_SIZE, __func__, &block->outgoing_entries); if (result != VDO_SUCCESS) return result; @@ -4114,8 +4108,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, if (result != VDO_SUCCESS) return result; - result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, - struct slab_summary_block, __func__, + result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, __func__, &allocator->summary_blocks); if (result != VDO_SUCCESS) return result; @@ -4174,8 +4167,7 @@ static int allocate_components(struct slab_depot *depot, depot->summary_origin = summary_partition->offset; depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift); - result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, - struct slab_summary_entry, __func__, + result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, __func__, &depot->summary_entries); if (result != VDO_SUCCESS) return result; @@ -4262,9 +4254,12 @@ int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo, } slab_size_shift = ilog2(slab_size); - result = vdo_allocate_extended(struct slab_depot, - vdo->thread_config.physical_zone_count, - struct block_allocator, __func__, &depot); + if (state.zone_count > MAX_VDO_PHYSICAL_ZONES) + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + 
"invalid zone count"); + + result = vdo_allocate_extended(vdo->thread_config.physical_zone_count, + allocators, __func__, &depot); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h index fadc0c9d4dc4..6bfd61c937b6 100644 --- a/drivers/md/dm-vdo/slab-depot.h +++ b/drivers/md/dm-vdo/slab-depot.h @@ -509,7 +509,7 @@ struct slab_depot { struct slab_summary_entry *summary_entries; /* The block allocators for this depot */ - struct block_allocator allocators[]; + struct block_allocator allocators[] __counted_by(zone_count); }; struct reference_updater; diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c index dd252d660b6d..9df5e4d7f884 100644 --- a/drivers/md/dm-vdo/status-codes.c +++ b/drivers/md/dm-vdo/status-codes.c @@ -80,6 +80,8 @@ int vdo_status_to_errno(int error) /* VDO or UDS error */ switch (error) { + case VDO_BAD_CONFIGURATION: + return -EINVAL; case VDO_NO_SPACE: return -ENOSPC; case VDO_READ_ONLY: diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index ec08478dd013..826afc952b56 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -56,7 +56,7 @@ int vdo_create_thread(void (*thread_function)(void *), void *thread_data, struct thread *thread; int result; - result = vdo_allocate(1, struct thread, __func__, &thread); + result = vdo_allocate(1, __func__, &thread); if (result != VDO_SUCCESS) { vdo_log_warning("Error allocating memory for %s", name); return result; diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h index cdf36e7d7702..0d60a88aa086 100644 --- a/drivers/md/dm-vdo/types.h +++ b/drivers/md/dm-vdo/types.h @@ -227,6 +227,9 @@ struct device_config { bool compression; struct thread_count_config thread_counts; block_count_t max_discard_blocks; + block_count_t slab_blocks; + int index_memory; + bool index_sparse; }; enum vdo_completion_type { diff --git a/drivers/md/dm-vdo/vdo.c 
b/drivers/md/dm-vdo/vdo.c index 09fd0628d18c..7bec2418c121 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -34,7 +34,9 @@ #include <linux/lz4.h> #include <linux/mutex.h> #include <linux/spinlock.h> +#include <linux/string.h> #include <linux/types.h> +#include <linux/uuid.h> #include "logger.h" #include "memory-alloc.h" @@ -55,6 +57,7 @@ #include "slab-depot.h" #include "statistics.h" #include "status-codes.h" +#include "time-utils.h" #include "vio.h" #define PARANOID_THREAD_CONSISTENCY_CHECKS 0 @@ -207,29 +210,28 @@ static int __must_check initialize_thread_config(struct thread_count_config coun config->hash_zone_count = counts.hash_zones; } - result = vdo_allocate(config->logical_zone_count, thread_id_t, - "logical thread array", &config->logical_threads); + result = vdo_allocate(config->logical_zone_count, "logical thread array", + &config->logical_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = vdo_allocate(config->physical_zone_count, thread_id_t, - "physical thread array", &config->physical_threads); + result = vdo_allocate(config->physical_zone_count, "physical thread array", + &config->physical_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = vdo_allocate(config->hash_zone_count, thread_id_t, - "hash thread array", &config->hash_zone_threads); + result = vdo_allocate(config->hash_zone_count, "hash thread array", + &config->hash_zone_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = vdo_allocate(config->bio_thread_count, thread_id_t, - "bio thread array", &config->bio_threads); + result = vdo_allocate(config->bio_thread_count, "bio thread array", &config->bio_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; @@ -256,56 +258,35 @@ static int __must_check initialize_thread_config(struct thread_count_config coun return VDO_SUCCESS; } -/** - * 
read_geometry_block() - Synchronously read the geometry block from a vdo's underlying block - * device. - * @vdo: The vdo whose geometry is to be read. - * - * Return: VDO_SUCCESS or an error code. - */ -static int __must_check read_geometry_block(struct vdo *vdo) +static int initialize_geometry_block(struct vdo *vdo, + struct vdo_geometry_block *geometry_block) { - struct vio *vio; - char *block; int result; - result = vdo_allocate(VDO_BLOCK_SIZE, u8, __func__, &block); + result = vdo_allocate(VDO_BLOCK_SIZE, "encoded geometry block", + (char **) &vdo->geometry_block.buffer); if (result != VDO_SUCCESS) return result; - result = create_metadata_vio(vdo, VIO_TYPE_GEOMETRY, VIO_PRIORITY_HIGH, NULL, - block, &vio); - if (result != VDO_SUCCESS) { - vdo_free(block); - return result; - } + return allocate_vio_components(vdo, VIO_TYPE_GEOMETRY, + VIO_PRIORITY_METADATA, NULL, 1, + (char *) geometry_block->buffer, + &vdo->geometry_block.vio); +} - /* - * This is only safe because, having not already loaded the geometry, the vdo's geometry's - * bio_offset field is 0, so the fact that vio_reset_bio() will subtract that offset from - * the supplied pbn is not a problem. 
- */ - result = vio_reset_bio(vio, block, NULL, REQ_OP_READ, - VDO_GEOMETRY_BLOCK_LOCATION); - if (result != VDO_SUCCESS) { - free_vio(vdo_forget(vio)); - vdo_free(block); - return result; - } +static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block) +{ + int result; - bio_set_dev(vio->bio, vdo_get_backing_device(vdo)); - submit_bio_wait(vio->bio); - result = blk_status_to_errno(vio->bio->bi_status); - free_vio(vdo_forget(vio)); - if (result != 0) { - vdo_log_error_strerror(result, "synchronous read failed"); - vdo_free(block); - return -EIO; - } + result = vdo_allocate(VDO_BLOCK_SIZE, "encoded super block", + (char **) &vdo->super_block.buffer); + if (result != VDO_SUCCESS) + return result; - result = vdo_parse_geometry_block((u8 *) block, &vdo->geometry); - vdo_free(block); - return result; + return allocate_vio_components(vdo, VIO_TYPE_SUPER_BLOCK, + VIO_PRIORITY_METADATA, NULL, 1, + (char *) super_block->buffer, + &vdo->super_block.vio); } static bool get_zone_thread_name(const thread_id_t thread_ids[], zone_count_t count, @@ -453,6 +434,69 @@ static int register_vdo(struct vdo *vdo) } /** + * vdo_format() - Format a block device to function as a new VDO. + * @vdo: The vdo to format. + * @error_ptr: The reason for any failure during this call. + * + * This function must be called on a device before a VDO can be loaded for the first time. + * Once a device has been formatted, the VDO can be loaded and shut down repeatedly. + * If a new VDO is desired, this function should be called again. 
+ * + * Return: VDO_SUCCESS or an error + **/ +static int __must_check vdo_format(struct vdo *vdo, char **error_ptr) +{ + int result; + uuid_t uuid; + nonce_t nonce = current_time_us(); + struct device_config *config = vdo->device_config; + + struct index_config index_config = { + .mem = config->index_memory, + .sparse = config->index_sparse, + }; + + struct vdo_config vdo_config = { + .logical_blocks = config->logical_blocks, + .physical_blocks = config->physical_blocks, + .slab_size = config->slab_blocks, + .slab_journal_blocks = DEFAULT_VDO_SLAB_JOURNAL_SIZE, + .recovery_journal_size = DEFAULT_VDO_RECOVERY_JOURNAL_SIZE, + }; + + uuid_gen(&uuid); + result = vdo_initialize_volume_geometry(nonce, &uuid, &index_config, &vdo->geometry); + if (result != VDO_SUCCESS) { + *error_ptr = "Could not initialize volume geometry during format"; + return result; + } + + result = vdo_initialize_component_states(&vdo_config, &vdo->geometry, nonce, &vdo->states); + if (result == VDO_NO_SPACE) { + block_count_t slab_blocks = config->slab_blocks; + /* 1 is counting geometry block */ + block_count_t fixed_layout_size = 1 + + vdo->geometry.regions[VDO_DATA_REGION].start_block + + DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT + + DEFAULT_VDO_RECOVERY_JOURNAL_SIZE + VDO_SLAB_SUMMARY_BLOCKS; + block_count_t necessary_size = fixed_layout_size + slab_blocks; + + vdo_log_error("Minimum required size for VDO volume: %llu bytes", + (unsigned long long) necessary_size * VDO_BLOCK_SIZE); + *error_ptr = "Could not allocate enough space for VDO during format"; + return result; + } + if (result != VDO_SUCCESS) { + *error_ptr = "Could not initialize data layout during format"; + return result; + } + + vdo->needs_formatting = true; + + return VDO_SUCCESS; +} + +/** * initialize_vdo() - Do the portion of initializing a vdo which will clean up after itself on * error. 
* @vdo: The vdo being initialized @@ -475,12 +519,39 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, vdo_initialize_completion(&vdo->admin.completion, vdo, VDO_ADMIN_COMPLETION); init_completion(&vdo->admin.callback_sync); mutex_init(&vdo->stats_mutex); - result = read_geometry_block(vdo); + + result = initialize_geometry_block(vdo, &vdo->geometry_block); + if (result != VDO_SUCCESS) { + *reason = "Could not initialize geometry block"; + return result; + } + + result = initialize_super_block(vdo, &vdo->super_block); + if (result != VDO_SUCCESS) { + *reason = "Could not initialize super block"; + return result; + } + + result = vdo_submit_metadata_vio_wait(&vdo->geometry_block.vio, + VDO_GEOMETRY_BLOCK_LOCATION, REQ_OP_READ); if (result != VDO_SUCCESS) { *reason = "Could not load geometry block"; return result; } + if (mem_is_zero(vdo->geometry_block.vio.data, VDO_BLOCK_SIZE)) { + result = vdo_format(vdo, reason); + if (result != VDO_SUCCESS) + return result; + } else { + result = vdo_parse_geometry_block(vdo->geometry_block.buffer, + &vdo->geometry); + if (result != VDO_SUCCESS) { + *reason = "Could not parse geometry block"; + return result; + } + } + result = initialize_thread_config(config->thread_counts, &vdo->thread_config); if (result != VDO_SUCCESS) { *reason = "Cannot create thread configuration"; @@ -493,7 +564,7 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, config->thread_counts.hash_zones, vdo->thread_config.thread_count); /* Compression context storage */ - result = vdo_allocate(config->thread_counts.cpu_threads, char *, "LZ4 context", + result = vdo_allocate(config->thread_counts.cpu_threads, "LZ4 context", &vdo->compression_context); if (result != VDO_SUCCESS) { *reason = "cannot allocate LZ4 context"; @@ -501,7 +572,7 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, } for (i = 0; i < config->thread_counts.cpu_threads; i++) { - result = vdo_allocate(LZ4_MEM_COMPRESS, 
char, "LZ4 context", + result = vdo_allocate(LZ4_MEM_COMPRESS, "LZ4 context", &vdo->compression_context[i]); if (result != VDO_SUCCESS) { *reason = "cannot allocate LZ4 context"; @@ -537,7 +608,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, /* Initialize with a generic failure reason to prevent returning garbage. */ *reason = "Unspecified error"; - result = vdo_allocate(1, struct vdo, __func__, &vdo); + result = vdo_allocate(1, __func__, &vdo); if (result != VDO_SUCCESS) { *reason = "Cannot allocate VDO"; return result; @@ -554,8 +625,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), "vdo%u", instance); - result = vdo_allocate(vdo->thread_config.thread_count, - struct vdo_thread, __func__, &vdo->threads); + result = vdo_allocate(vdo->thread_config.thread_count, __func__, &vdo->threads); if (result != VDO_SUCCESS) { *reason = "Cannot allocate thread structures"; return result; @@ -648,6 +718,12 @@ static void free_listeners(struct vdo_thread *thread) } } +static void uninitialize_geometry_block(struct vdo_geometry_block *geometry_block) +{ + free_vio_components(&geometry_block->vio); + vdo_free(geometry_block->buffer); +} + static void uninitialize_super_block(struct vdo_super_block *super_block) { free_vio_components(&super_block->vio); @@ -695,6 +771,7 @@ void vdo_destroy(struct vdo *vdo) vdo_uninitialize_layout(&vdo->next_layout); if (vdo->partition_copier) dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier)); + uninitialize_geometry_block(&vdo->geometry_block); uninitialize_super_block(&vdo->super_block); vdo_free_block_map(vdo_forget(vdo->block_map)); vdo_free_hash_zones(vdo_forget(vdo->hash_zones)); @@ -720,21 +797,6 @@ void vdo_destroy(struct vdo *vdo) vdo_free(vdo); } -static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block) -{ - int result; - - result = 
vdo_allocate(VDO_BLOCK_SIZE, char, "encoded super block", - (char **) &vdo->super_block.buffer); - if (result != VDO_SUCCESS) - return result; - - return allocate_vio_components(vdo, VIO_TYPE_SUPER_BLOCK, - VIO_PRIORITY_METADATA, NULL, 1, - (char *) super_block->buffer, - &vdo->super_block.vio); -} - /** * finish_reading_super_block() - Continue after loading the super block. * @completion: The super block vio. @@ -778,14 +840,6 @@ static void read_super_block_endio(struct bio *bio) */ void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent) { - int result; - - result = initialize_super_block(vdo, &vdo->super_block); - if (result != VDO_SUCCESS) { - vdo_continue_completion(parent, result); - return; - } - vdo->super_block.vio.completion.parent = parent; vdo_submit_metadata_vio(&vdo->super_block.vio, vdo_get_data_region_start(vdo->geometry), @@ -899,24 +953,101 @@ static void record_vdo(struct vdo *vdo) vdo->states.layout = vdo->layout; } +static int __must_check clear_partition(struct vdo *vdo, enum partition_id id) +{ + struct partition *partition; + int result; + + result = vdo_get_partition(&vdo->states.layout, id, &partition); + if (result != VDO_SUCCESS) + return result; + + return blkdev_issue_zeroout(vdo_get_backing_device(vdo), + partition->offset * VDO_SECTORS_PER_BLOCK, + partition->count * VDO_SECTORS_PER_BLOCK, + GFP_NOWAIT, 0); +} + +int vdo_clear_layout(struct vdo *vdo) +{ + int result; + + /* Zero out the uds index's first block. */ + result = blkdev_issue_zeroout(vdo_get_backing_device(vdo), + VDO_SECTORS_PER_BLOCK, + VDO_SECTORS_PER_BLOCK, + GFP_NOWAIT, 0); + if (result != VDO_SUCCESS) + return result; + + result = clear_partition(vdo, VDO_BLOCK_MAP_PARTITION); + if (result != VDO_SUCCESS) + return result; + + return clear_partition(vdo, VDO_RECOVERY_JOURNAL_PARTITION); +} + /** - * continue_super_block_parent() - Continue the parent of a super block save operation. - * @completion: The super block vio. 
+ * continue_parent() - Continue the parent of a save operation. + * @completion: The completion to continue. * - * This callback is registered in vdo_save_components(). */ -static void continue_super_block_parent(struct vdo_completion *completion) +static void continue_parent(struct vdo_completion *completion) { vdo_continue_completion(vdo_forget(completion->parent), completion->result); } +static void handle_write_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo_completion *parent = vio->completion.parent; + + continue_vio_after_io(vio, continue_parent, + parent->callback_thread_id); +} + /** - * handle_save_error() - Log a super block save error. + * handle_geometry_block_save_error() - Log a geometry block save error. + * @completion: The super block vio. + * + * This error handler is registered in vdo_save_geometry_block(). + */ +static void handle_geometry_block_save_error(struct vdo_completion *completion) +{ + struct vdo_geometry_block *geometry_block = + container_of(as_vio(completion), struct vdo_geometry_block, vio); + + vio_record_metadata_io_error(&geometry_block->vio); + vdo_log_error_strerror(completion->result, "geometry block save failed"); + completion->callback(completion); +} + +/** + * vdo_save_geometry_block() - Encode the vdo and save the geometry block asynchronously. + * @vdo: The vdo whose state is being saved. + * @parent: The completion to notify when the save is complete. 
+ */ +void vdo_save_geometry_block(struct vdo *vdo, struct vdo_completion *parent) +{ + struct vdo_geometry_block *geometry_block = &vdo->geometry_block; + + vdo_encode_volume_geometry(geometry_block->buffer, &vdo->geometry, + VDO_DEFAULT_GEOMETRY_BLOCK_VERSION); + geometry_block->vio.completion.parent = parent; + geometry_block->vio.completion.callback_thread_id = parent->callback_thread_id; + vdo_submit_metadata_vio(&geometry_block->vio, + VDO_GEOMETRY_BLOCK_LOCATION, + handle_write_endio, handle_geometry_block_save_error, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); +} + +/** + * handle_super_block_save_error() - Log a super block save error. * @completion: The super block vio. * * This error handler is registered in vdo_save_components(). */ -static void handle_save_error(struct vdo_completion *completion) +static void handle_super_block_save_error(struct vdo_completion *completion) { struct vdo_super_block *super_block = container_of(as_vio(completion), struct vdo_super_block, vio); @@ -935,17 +1066,27 @@ static void handle_save_error(struct vdo_completion *completion) completion->callback(completion); } -static void super_block_write_endio(struct bio *bio) +/** + * vdo_save_super_block() - Save the component states to the super block asynchronously. + * @vdo: The vdo whose state is being saved. + * @parent: The completion to notify when the save is complete. 
+ */ +void vdo_save_super_block(struct vdo *vdo, struct vdo_completion *parent) { - struct vio *vio = bio->bi_private; - struct vdo_completion *parent = vio->completion.parent; + struct vdo_super_block *super_block = &vdo->super_block; - continue_vio_after_io(vio, continue_super_block_parent, - parent->callback_thread_id); + vdo_encode_super_block(super_block->buffer, &vdo->states); + super_block->vio.completion.parent = parent; + super_block->vio.completion.callback_thread_id = parent->callback_thread_id; + vdo_submit_metadata_vio(&super_block->vio, + vdo_get_data_region_start(vdo->geometry), + handle_write_endio, handle_super_block_save_error, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); } /** - * vdo_save_components() - Encode the vdo and save the super block asynchronously. + * vdo_save_components() - Copy the current state of the VDO to the states struct and save + * it to the super block asynchronously. * @vdo: The vdo whose state is being saved. * @parent: The completion to notify when the save is complete. 
*/ @@ -964,14 +1105,7 @@ void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent) } record_vdo(vdo); - - vdo_encode_super_block(super_block->buffer, &vdo->states); - super_block->vio.completion.parent = parent; - super_block->vio.completion.callback_thread_id = parent->callback_thread_id; - vdo_submit_metadata_vio(&super_block->vio, - vdo_get_data_region_start(vdo->geometry), - super_block_write_endio, handle_save_error, - REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); + vdo_save_super_block(vdo, parent); } /** @@ -997,8 +1131,7 @@ int vdo_register_read_only_listener(struct vdo *vdo, void *listener, if (result != VDO_SUCCESS) return result; - result = vdo_allocate(1, struct read_only_listener, __func__, - &read_only_listener); + result = vdo_allocate(1, __func__, &read_only_listener); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h index 1aaba73997b7..9a63f5d45ce3 100644 --- a/drivers/md/dm-vdo/vdo.h +++ b/drivers/md/dm-vdo/vdo.h @@ -144,6 +144,13 @@ struct thread_config { struct thread_count_config; +struct vdo_geometry_block { + /* The vio for reading and writing the geometry block to disk */ + struct vio vio; + /* A buffer to hold the geometry block */ + u8 *buffer; +}; + struct vdo_super_block { /* The vio for reading and writing the super block to disk */ struct vio vio; @@ -186,6 +193,9 @@ struct vdo { /* The thread mapping */ struct thread_config thread_config; + /* The geometry block */ + struct vdo_geometry_block geometry_block; + /* The super block */ struct vdo_super_block super_block; @@ -236,6 +246,7 @@ struct vdo { const struct admin_state_code *suspend_type; bool allocations_allowed; bool dump_on_shutdown; + bool needs_formatting; atomic_t processing_message; /* @@ -304,6 +315,10 @@ int __must_check vdo_make(unsigned int instance, struct device_config *config, void vdo_destroy(struct vdo *vdo); +int __must_check vdo_format_components(struct vdo *vdo); + +void 
vdo_format_super_block(struct vdo *vdo, struct vdo_completion *parent); + void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent); struct block_device * __must_check vdo_get_backing_device(const struct vdo *vdo); @@ -326,6 +341,10 @@ enum vdo_state __must_check vdo_get_state(const struct vdo *vdo); void vdo_set_state(struct vdo *vdo, enum vdo_state state); +int vdo_clear_layout(struct vdo *vdo); +void vdo_save_geometry_block(struct vdo *vdo, struct vdo_completion *parent); +void vdo_save_super_block(struct vdo *vdo, struct vdo_completion *parent); + void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent); int vdo_register_read_only_listener(struct vdo *vdo, void *listener, diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index 5ffc867d9c5e..ea8ac619ff1b 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -52,8 +52,8 @@ static int create_multi_block_bio(block_count_t size, struct bio **bio_ptr) struct bio *bio = NULL; int result; - result = vdo_allocate_extended(struct bio, size + 1, struct bio_vec, - "bio", &bio); + result = vdo_allocate_memory(sizeof(struct bio) + sizeof(struct bio_vec) * (size + 1), + __alignof__(struct bio), "bio", &bio); if (result != VDO_SUCCESS) return result; @@ -129,7 +129,7 @@ int create_multi_block_metadata_vio(struct vdo *vdo, enum vio_type vio_type, * Metadata vios should use direct allocation and not use the buffer pool, which is * reserved for submissions from the linux block layer. 
*/ - result = vdo_allocate(1, struct vio, __func__, &vio); + result = vdo_allocate(1, __func__, &vio); if (result != VDO_SUCCESS) { vdo_log_error("metadata vio allocation failure %d", result); return result; @@ -327,8 +327,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_ int result; size_t per_vio_size = VDO_BLOCK_SIZE * block_count; - result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, - __func__, &pool); + result = vdo_allocate_extended(pool_size, vios, __func__, &pool); if (result != VDO_SUCCESS) return result; @@ -336,8 +335,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_ INIT_LIST_HEAD(&pool->available); INIT_LIST_HEAD(&pool->busy); - result = vdo_allocate(pool_size * per_vio_size, char, - "VIO pool buffer", &pool->buffer); + result = vdo_allocate(pool_size * per_vio_size, "VIO pool buffer", &pool->buffer); if (result != VDO_SUCCESS) { free_vio_pool(pool); return result; diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 14be4d888af3..85ad9dc210ff 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -11,163 +11,131 @@ #define DM_MSG_PREFIX "verity-fec" /* - * When correcting a data block, the FEC code performs optimally when it can - * collect all the associated RS blocks at the same time. As each byte is part - * of a different RS block, there are '1 << data_dev_block_bits' RS blocks. - * There are '1 << DM_VERITY_FEC_BUF_RS_BITS' RS blocks per buffer, so that - * gives '1 << (data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS)' buffers. + * When correcting a block, the FEC implementation performs optimally when it + * can collect all the associated RS codewords at the same time. As each byte + * is part of a different codeword, there are '1 << data_dev_block_bits' + * codewords. 
Each buffer has space for the message bytes for + * '1 << DM_VERITY_FEC_BUF_RS_BITS' codewords, so that gives + * '1 << (data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS)' buffers. */ static inline unsigned int fec_max_nbufs(struct dm_verity *v) { return 1 << (v->data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS); } -/* - * Return an interleaved offset for a byte in RS block. - */ -static inline u64 fec_interleave(struct dm_verity *v, u64 offset) -{ - u32 mod; - - mod = do_div(offset, v->fec->rsn); - return offset + mod * (v->fec->rounds << v->data_dev_block_bits); -} - -/* - * Read error-correcting codes for the requested RS block. Returns a pointer - * to the data block. Caller is responsible for releasing buf. - */ -static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, - unsigned int *offset, unsigned int par_buf_offset, - struct dm_buffer **buf, unsigned short ioprio) -{ - u64 position, block, rem; - u8 *res; - - /* We have already part of parity bytes read, skip to the next block */ - if (par_buf_offset) - index++; - - position = (index + rsb) * v->fec->roots; - block = div64_u64_rem(position, v->fec->io_size, &rem); - *offset = par_buf_offset ? 0 : (unsigned int)rem; - - res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio); - if (IS_ERR(res)) { - DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", - v->data_dev->name, (unsigned long long)rsb, - (unsigned long long)block, PTR_ERR(res)); - *buf = NULL; - } - - return res; -} - /* Loop over each allocated buffer. */ #define fec_for_each_buffer(io, __i) \ for (__i = 0; __i < (io)->nbufs; __i++) -/* Loop over each RS block in each allocated buffer. */ -#define fec_for_each_buffer_rs_block(io, __i, __j) \ +/* Loop over each RS message in each allocated buffer. */ +/* To stop early, use 'goto', not 'break' (since this uses nested loops). 
*/ +#define fec_for_each_buffer_rs_message(io, __i, __j) \ fec_for_each_buffer(io, __i) \ for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++) /* - * Return a pointer to the current RS block when called inside - * fec_for_each_buffer_rs_block. + * Return a pointer to the current RS message when called inside + * fec_for_each_buffer_rs_message. */ -static inline u8 *fec_buffer_rs_block(struct dm_verity *v, - struct dm_verity_fec_io *fio, - unsigned int i, unsigned int j) +static inline u8 *fec_buffer_rs_message(struct dm_verity *v, + struct dm_verity_fec_io *fio, + unsigned int i, unsigned int j) { - return &fio->bufs[i][j * v->fec->rsn]; + return &fio->bufs[i][j * v->fec->rs_k]; } /* - * Return an index to the current RS block when called inside - * fec_for_each_buffer_rs_block. - */ -static inline unsigned int fec_buffer_rs_index(unsigned int i, unsigned int j) -{ - return (i << DM_VERITY_FEC_BUF_RS_BITS) + j; -} - -/* - * Decode all RS blocks from buffers and copy corrected bytes into fio->output - * starting from block_offset. + * Decode all RS codewords whose message bytes were loaded into fio->bufs. Copy + * the corrected bytes into fio->output starting from out_pos. 
*/ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, - struct dm_verity_fec_io *fio, u64 rsb, int byte_index, - unsigned int block_offset, int neras) + struct dm_verity_fec_io *fio, u64 target_block, + unsigned int target_region, u64 index_in_region, + unsigned int out_pos, int neras) { - int r, corrected = 0, res; + int r = 0, corrected = 0, res; struct dm_buffer *buf; - unsigned int n, i, j, offset, par_buf_offset = 0; - uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; - u8 *par, *block; + unsigned int n, i, j, parity_pos, to_copy; + uint16_t par_buf[DM_VERITY_FEC_MAX_ROOTS]; + u8 *par, *msg_buf; + u64 parity_block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - par = fec_read_parity(v, rsb, block_offset, &offset, - par_buf_offset, &buf, bio->bi_ioprio); - if (IS_ERR(par)) + /* + * Compute the index of the first parity block that will be needed and + * the starting position in that block. Then read that block. + * + * block_size is always a power of 2, but roots might not be. Note that + * when it's not, a codeword's parity bytes can span a block boundary. + */ + parity_block = ((index_in_region << v->data_dev_block_bits) + out_pos) * + v->fec->roots; + parity_pos = parity_block & (v->fec->block_size - 1); + parity_block >>= v->data_dev_block_bits; + par = dm_bufio_read_with_ioprio(v->fec->bufio, parity_block, &buf, + bio->bi_ioprio); + if (IS_ERR(par)) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, target_block, parity_block, + PTR_ERR(par)); return PTR_ERR(par); + } /* - * Decode the RS blocks we have in bufs. Each RS block results in - * one corrected target byte and consumes fec->roots parity bytes. + * Decode the RS codewords whose message bytes are in bufs. Each RS + * codeword results in one corrected target byte and consumes fec->roots + * parity bytes. 
*/ - fec_for_each_buffer_rs_block(fio, n, i) { - block = fec_buffer_rs_block(v, fio, n, i); - for (j = 0; j < v->fec->roots - par_buf_offset; j++) - par_buf[par_buf_offset + j] = par[offset + j]; - /* Decode an RS block using Reed-Solomon */ - res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn, + fec_for_each_buffer_rs_message(fio, n, i) { + msg_buf = fec_buffer_rs_message(v, fio, n, i); + + /* + * Copy the next 'roots' parity bytes to 'par_buf', reading + * another parity block if needed. + */ + to_copy = min(v->fec->block_size - parity_pos, v->fec->roots); + for (j = 0; j < to_copy; j++) + par_buf[j] = par[parity_pos++]; + if (to_copy < v->fec->roots) { + parity_block++; + parity_pos = 0; + + dm_bufio_release(buf); + par = dm_bufio_read_with_ioprio(v->fec->bufio, + parity_block, &buf, + bio->bi_ioprio); + if (IS_ERR(par)) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, target_block, + parity_block, PTR_ERR(par)); + return PTR_ERR(par); + } + for (; j < v->fec->roots; j++) + par_buf[j] = par[parity_pos++]; + } + + /* Decode an RS codeword using the Reed-Solomon library. 
*/ + res = decode_rs8(fio->rs, msg_buf, par_buf, v->fec->rs_k, NULL, neras, fio->erasures, 0, NULL); if (res < 0) { r = res; - goto error; + goto done; } - corrected += res; - fio->output[block_offset] = block[byte_index]; + fio->output[out_pos++] = msg_buf[target_region]; - block_offset++; - if (block_offset >= 1 << v->data_dev_block_bits) + if (out_pos >= v->fec->block_size) goto done; - - /* Read the next block when we run out of parity bytes */ - offset += (v->fec->roots - par_buf_offset); - /* Check if parity bytes are split between blocks */ - if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) { - par_buf_offset = v->fec->io_size - offset; - for (j = 0; j < par_buf_offset; j++) - par_buf[j] = par[offset + j]; - offset += par_buf_offset; - } else - par_buf_offset = 0; - - if (offset >= v->fec->io_size) { - dm_bufio_release(buf); - - par = fec_read_parity(v, rsb, block_offset, &offset, - par_buf_offset, &buf, bio->bi_ioprio); - if (IS_ERR(par)) - return PTR_ERR(par); - } } done: - r = corrected; -error: dm_bufio_release(buf); if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", - v->data_dev->name, (unsigned long long)rsb, r); - else if (r > 0) { + v->data_dev->name, target_block, r); + else if (r == 0) DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", - v->data_dev->name, (unsigned long long)rsb, r); - atomic64_inc(&v->fec->corrected); - } + v->data_dev->name, target_block, corrected); return r; } @@ -178,7 +146,7 @@ error: static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, const u8 *want_digest, const u8 *data) { - if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, + if (unlikely(verity_hash(v, io, data, v->fec->block_size, io->tmp_digest))) return 0; @@ -186,22 +154,35 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, } /* - * Read data blocks that are part of the RS block and deinterleave as much as - * fits into buffers. 
Check for erasure locations if @neras is non-NULL. + * Read the message block at index @index_in_region within each of the + * @v->fec->rs_k regions and deinterleave their contents into @io->fec_io->bufs. + * + * @target_block gives the index of specific block within this sequence that is + * being corrected, relative to the start of all the FEC message blocks. + * + * @out_pos gives the current output position, i.e. the position in (each) block + * from which to start the deinterleaving. Deinterleaving continues until + * either end-of-block is reached or there's no more buffer space. + * + * If @neras is non-NULL, then also use verity hashes and the presence/absence + * of I/O errors to determine which of the message blocks in the sequence are + * likely to be incorrect. Write the number of such blocks to *@neras and the + * indices of the corresponding RS message bytes in [0, k - 1] to + * @io->fec_io->erasures, up to a limit of @v->fec->roots + 1 such blocks. */ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, - u64 rsb, u64 target, unsigned int block_offset, - int *neras) + u64 target_block, u64 index_in_region, + unsigned int out_pos, int *neras) { bool is_zero; - int i, j, target_index = -1; + int i, j; struct dm_buffer *buf; struct dm_bufio_client *bufio; struct dm_verity_fec_io *fio = io->fec_io; - u64 block, ileaved; - u8 *bbuf, *rs_block; + u64 block; + u8 *bbuf; u8 want_digest[HASH_MAX_DIGESTSIZE]; - unsigned int n, k; + unsigned int n, src_pos; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); if (neras) @@ -210,21 +191,12 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, if (WARN_ON(v->digest_size > sizeof(want_digest))) return -EINVAL; - /* - * read each of the rsn data blocks that are part of the RS block, and - * interleave contents to available bufs - */ - for (i = 0; i < v->fec->rsn; i++) { - ileaved = fec_interleave(v, rsb * v->fec->rsn + i); - + for (i = 0; i < v->fec->rs_k; 
i++) { /* - * target is the data block we want to correct, target_index is - * the index of this block within the rsn RS blocks + * Read the block from region i. It contains the i'th message + * byte of the target block's RS codewords. */ - if (ileaved == target) - target_index = i; - - block = ileaved >> v->data_dev_block_bits; + block = i * v->fec->region_blocks + index_in_region; bufio = v->fec->data_bufio; if (block >= v->data_blocks) { @@ -244,9 +216,8 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio); if (IS_ERR(bbuf)) { DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", - v->data_dev->name, - (unsigned long long)rsb, - (unsigned long long)block, PTR_ERR(bbuf)); + v->data_dev->name, target_block, block, + PTR_ERR(bbuf)); /* assume the block is corrupted */ if (neras && *neras <= v->fec->roots) @@ -273,23 +244,20 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, } /* - * deinterleave and copy the bytes that fit into bufs, - * starting from block_offset + * Deinterleave the bytes of the block, starting from 'out_pos', + * into the i'th byte of the RS message buffers. Stop when + * end-of-block is reached or there are no more buffers. 
*/ - fec_for_each_buffer_rs_block(fio, n, j) { - k = fec_buffer_rs_index(n, j) + block_offset; - - if (k >= 1 << v->data_dev_block_bits) + src_pos = out_pos; + fec_for_each_buffer_rs_message(fio, n, j) { + if (src_pos >= v->fec->block_size) goto done; - - rs_block = fec_buffer_rs_block(v, fio, n, j); - rs_block[i] = bbuf[k]; + fec_buffer_rs_message(v, fio, n, j)[i] = bbuf[src_pos++]; } done: dm_bufio_release(buf); } - - return target_index; + return 0; } /* @@ -336,47 +304,65 @@ static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) unsigned int n; fec_for_each_buffer(fio, n) - memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS); + memset(fio->bufs[n], 0, v->fec->rs_k << DM_VERITY_FEC_BUF_RS_BITS); memset(fio->erasures, 0, sizeof(fio->erasures)); } /* - * Decode all RS blocks in a single data block and return the target block - * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses - * hashes to locate erasures. + * Try to correct the message (data or hash) block at index @target_block. + * + * If @use_erasures is true, use verity hashes to locate erasures. This makes + * the error correction slower but up to twice as capable. + * + * On success, return 0 and write the corrected block to @fio->output. 0 is + * returned only if the digest of the corrected block matches @want_digest; this + * is critical to ensure that FEC can't cause dm-verity to return bad data. 
*/ -static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, - struct dm_verity_fec_io *fio, u64 rsb, u64 offset, - const u8 *want_digest, bool use_erasures) +static int fec_decode(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 target_block, + const u8 *want_digest, bool use_erasures) { int r, neras = 0; - unsigned int pos; + unsigned int target_region, out_pos; + u64 index_in_region; - for (pos = 0; pos < 1 << v->data_dev_block_bits; ) { + /* + * Compute 'target_region', the index of the region the target block is + * in; and 'index_in_region', the index of the target block within its + * region. The latter value is also the index within its region of each + * message block that shares its RS codewords with the target block. + */ + target_region = div64_u64_rem(target_block, v->fec->region_blocks, + &index_in_region); + if (WARN_ON_ONCE(target_region >= v->fec->rs_k)) + /* target_block is out-of-bounds. Should never happen. */ + return -EIO; + + for (out_pos = 0; out_pos < v->fec->block_size;) { fec_init_bufs(v, fio); - r = fec_read_bufs(v, io, rsb, offset, pos, + r = fec_read_bufs(v, io, target_block, index_in_region, out_pos, use_erasures ? 
&neras : NULL); if (unlikely(r < 0)) return r; - r = fec_decode_bufs(v, io, fio, rsb, r, pos, neras); + r = fec_decode_bufs(v, io, fio, target_block, target_region, + index_in_region, out_pos, neras); if (r < 0) return r; - pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; + out_pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; } /* Always re-validate the corrected block against the expected hash */ - r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, - io->tmp_digest); + r = verity_hash(v, io, fio->output, v->fec->block_size, io->tmp_digest); if (unlikely(r < 0)) return r; if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", - v->data_dev->name, (unsigned long long)rsb, neras); + v->data_dev->name, target_block, neras); return -EILSEQ; } @@ -390,7 +376,6 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, { int r; struct dm_verity_fec_io *fio; - u64 offset, res, rsb; if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; @@ -408,37 +393,19 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, block = block - v->hash_start + v->data_blocks; /* - * For RS(M, N), the continuous FEC data is divided into blocks of N - * bytes. Since block size may not be divisible by N, the last block - * is zero padded when decoding. - * - * Each byte of the block is covered by a different RS(M, N) code, - * and each code is interleaved over N blocks to make it less likely - * that bursty corruption will leave us in unrecoverable state. - */ - - offset = block << v->data_dev_block_bits; - res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits); - - /* - * The base RS block we can feed to the interleaver to find out all - * blocks required for decoding. - */ - rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits); - - /* * Locating erasures is slow, so attempt to recover the block without * them first. 
Do a second attempt with erasures if the corruption is * bad enough. */ - r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, false); + r = fec_decode(v, io, fio, block, want_digest, false); if (r < 0) { - r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, true); + r = fec_decode(v, io, fio, block, want_digest, true); if (r < 0) goto done; } - memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + memcpy(dest, fio->output, v->fec->block_size); + atomic64_inc(&v->fec->corrected); done: fio->level--; @@ -585,8 +552,8 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) { if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c || - num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) || - num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) { + num_c < DM_VERITY_FEC_MIN_ROOTS || + num_c > DM_VERITY_FEC_MAX_ROOTS) { ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS; return -EINVAL; } @@ -625,7 +592,7 @@ int verity_fec_ctr(struct dm_verity *v) { struct dm_verity_fec *f = v->fec; struct dm_target *ti = v->ti; - u64 hash_blocks, fec_blocks; + u64 hash_blocks; int ret; if (!verity_fec_is_enabled(v)) { @@ -648,7 +615,7 @@ int verity_fec_ctr(struct dm_verity *v) * hash device after the hash blocks. 
*/ - hash_blocks = v->hash_blocks - v->hash_start; + hash_blocks = v->hash_end - v->hash_start; /* * Require matching block sizes for data and hash devices for @@ -658,27 +625,28 @@ int verity_fec_ctr(struct dm_verity *v) ti->error = "Block sizes must match to use FEC"; return -EINVAL; } + f->block_size = 1 << v->data_dev_block_bits; if (!f->roots) { ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; return -EINVAL; } - f->rsn = DM_VERITY_FEC_RSM - f->roots; + f->rs_k = DM_VERITY_FEC_RS_N - f->roots; if (!f->blocks) { ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS; return -EINVAL; } - f->rounds = f->blocks; - if (sector_div(f->rounds, f->rsn)) - f->rounds++; + f->region_blocks = f->blocks; + if (sector_div(f->region_blocks, f->rs_k)) + f->region_blocks++; /* * Due to optional metadata, f->blocks can be larger than * data_blocks and hash_blocks combined. */ - if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) { + if (f->blocks < v->data_blocks + hash_blocks || !f->region_blocks) { ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; return -EINVAL; } @@ -688,16 +656,14 @@ int verity_fec_ctr(struct dm_verity *v) * it to be large enough. 
*/ f->hash_blocks = f->blocks - v->data_blocks; - if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) { + if (dm_bufio_get_device_size(v->bufio) < + v->hash_start + f->hash_blocks) { ti->error = "Hash device is too small for " DM_VERITY_OPT_FEC_BLOCKS; return -E2BIG; } - f->io_size = 1 << v->data_dev_block_bits; - - f->bufio = dm_bufio_client_create(f->dev->bdev, - f->io_size, + f->bufio = dm_bufio_client_create(f->dev->bdev, f->block_size, 1, 0, NULL, NULL, 0); if (IS_ERR(f->bufio)) { ti->error = "Cannot initialize FEC bufio client"; @@ -706,14 +672,12 @@ int verity_fec_ctr(struct dm_verity *v) dm_bufio_set_sector_offset(f->bufio, f->start << (v->data_dev_block_bits - SECTOR_SHIFT)); - fec_blocks = div64_u64(f->rounds * f->roots, v->fec->roots << SECTOR_SHIFT); - if (dm_bufio_get_device_size(f->bufio) < fec_blocks) { + if (dm_bufio_get_device_size(f->bufio) < f->region_blocks * f->roots) { ti->error = "FEC device is too small"; return -E2BIG; } - f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, - 1 << v->data_dev_block_bits, + f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, f->block_size, 1, 0, NULL, NULL, 0); if (IS_ERR(f->data_bufio)) { ti->error = "Cannot initialize FEC data bufio client"; @@ -743,7 +707,7 @@ int verity_fec_ctr(struct dm_verity *v) } f->cache = kmem_cache_create("dm_verity_fec_buffers", - f->rsn << DM_VERITY_FEC_BUF_RS_BITS, + f->rs_k << DM_VERITY_FEC_BUF_RS_BITS, 0, 0, NULL); if (!f->cache) { ti->error = "Cannot create FEC buffer cache"; @@ -760,7 +724,7 @@ int verity_fec_ctr(struct dm_verity *v) /* Preallocate an output buffer for each thread */ ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(), - 1 << v->data_dev_block_bits); + f->block_size); if (ret) { ti->error = "Cannot allocate FEC output pool"; return ret; diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 35d28d9f8a9b..50b5e187d5cc 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -11,13 +11,13 
@@ #include "dm-verity.h" #include <linux/rslib.h> -/* Reed-Solomon(M, N) parameters */ -#define DM_VERITY_FEC_RSM 255 -#define DM_VERITY_FEC_MAX_RSN 253 -#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ +/* Reed-Solomon(n, k) parameters */ +#define DM_VERITY_FEC_RS_N 255 +#define DM_VERITY_FEC_MIN_ROOTS 2 /* RS(255, 253): ~0.8% space overhead */ +#define DM_VERITY_FEC_MAX_ROOTS 24 /* RS(255, 231): ~10% space overhead */ /* buffers for deinterleaving and decoding */ -#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ +#define DM_VERITY_FEC_BUF_RS_BITS 4 /* log2(RS messages per buffer) */ #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" @@ -29,13 +29,13 @@ struct dm_verity_fec { struct dm_dev *dev; /* parity data device */ struct dm_bufio_client *data_bufio; /* for data dev access */ struct dm_bufio_client *bufio; /* for parity data access */ - size_t io_size; /* IO size for roots */ + size_t block_size; /* size of data, hash, and parity blocks in bytes */ sector_t start; /* parity data start in blocks */ sector_t blocks; /* number of blocks covered */ - sector_t rounds; /* number of interleaving rounds */ + sector_t region_blocks; /* blocks per region: ceil(blocks / rs_k) */ sector_t hash_blocks; /* blocks covered after v->hash_start */ - unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ - unsigned char rsn; /* N of RS(M, N) */ + unsigned char roots; /* parity bytes per RS codeword, n-k of RS(n, k) */ + unsigned char rs_k; /* message bytes per RS codeword, k of RS(n, k) */ mempool_t fio_pool; /* mempool for dm_verity_fec_io */ mempool_t rs_pool; /* mempool for fio->rs */ mempool_t prealloc_pool; /* mempool for preallocated buffers */ @@ -47,15 +47,15 @@ struct dm_verity_fec { /* per-bio data */ struct dm_verity_fec_io { struct rs_control *rs; /* Reed-Solomon state */ - int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ + int erasures[DM_VERITY_FEC_MAX_ROOTS 
+ 1]; /* erasures for decode_rs8 */ u8 *output; /* buffer for corrected output */ unsigned int level; /* recursion level */ unsigned int nbufs; /* number of buffers allocated */ /* - * Buffers for deinterleaving RS blocks. Each buffer has space for - * the data bytes of (1 << DM_VERITY_FEC_BUF_RS_BITS) RS blocks. The - * array length is fec_max_nbufs(v), and we try to allocate that many - * buffers. However, in low-memory situations we may be unable to + * Buffers for deinterleaving RS codewords. Each buffer has space for + * the message bytes of (1 << DM_VERITY_FEC_BUF_RS_BITS) RS codewords. + * The array length is fec_max_nbufs(v), and we try to allocate that + * many buffers. However, in low-memory situations we may be unable to * allocate all buffers. 'nbufs' holds the number actually allocated. */ u8 *bufs[]; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 61073cd01d13..9a9847f94c46 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -733,8 +733,8 @@ static void verity_prefetch_io(struct work_struct *work) hash_block_start &= ~(sector_t)(cluster - 1); hash_block_end |= cluster - 1; - if (unlikely(hash_block_end >= v->hash_blocks)) - hash_block_end = v->hash_blocks - 1; + if (unlikely(hash_block_end >= v->hash_end)) + hash_block_end = v->hash_end - 1; } no_prefetch_cluster: dm_bufio_prefetch_with_ioprio(v->bufio, hash_block_start, @@ -1011,13 +1011,7 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct dm_verity *v = ti->private; - if (limits->logical_block_size < 1 << v->data_dev_block_bits) - limits->logical_block_size = 1 << v->data_dev_block_bits; - - if (limits->physical_block_size < 1 << v->data_dev_block_bits) - limits->physical_block_size = 1 << v->data_dev_block_bits; - - limits->io_min = limits->logical_block_size; + dm_stack_bs_limits(limits, 1 << v->data_dev_block_bits); /* * Similar to what dm-crypt does, opt dm-verity out of support for @@ -1607,7 
+1601,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } hash_position += s; } - v->hash_blocks = hash_position; + v->hash_end = hash_position; r = mempool_init_page_pool(&v->recheck_pool, 1, 0); if (unlikely(r)) { @@ -1634,7 +1628,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { + if (dm_bufio_get_device_size(v->bufio) < v->hash_end) { ti->error = "Hash device is too small"; r = -E2BIG; goto bad; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index d6bfabb27113..2922263501f6 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -53,9 +53,9 @@ struct dm_verity { unsigned int sig_size; /* root digest signature size */ #endif /* CONFIG_SECURITY */ unsigned int salt_size; - sector_t hash_start; /* hash start in blocks */ + sector_t hash_start; /* index of first hash block on hash_dev */ + sector_t hash_end; /* 1 + index of last hash block on hash dev */ sector_t data_blocks; /* the number of data blocks */ - sector_t hash_blocks; /* the number of hash blocks */ unsigned char data_dev_block_bits; /* log2(data blocksize) */ unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 98bd945f6da7..493f5202ad04 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1640,17 +1640,9 @@ static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limit { struct dm_writecache *wc = ti->private; - if (limits->logical_block_size < wc->block_size) - limits->logical_block_size = wc->block_size; - - if (limits->physical_block_size < wc->block_size) - limits->physical_block_size = wc->block_size; - - if (limits->io_min < wc->block_size) - limits->io_min = wc->block_size; + dm_stack_bs_limits(limits, wc->block_size); } - static void 
writecache_writeback_endio(struct bio *bio) { struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index bf398d7476b3..9e7e6b1a6f15 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -208,6 +208,20 @@ enum llbitmap_state { BitNeedSync, /* data is synchronizing */ BitSyncing, + /* + * Proactive sync requested for unwritten region (raid456 only). + * Triggered via sysfs when user wants to pre-build XOR parity + * for regions that have never been written. + */ + BitNeedSyncUnwritten, + /* Proactive sync in progress for unwritten region */ + BitSyncingUnwritten, + /* + * XOR parity has been pre-built for a region that has never had + * user data written. When user writes to this region, it transitions + * to BitDirty. + */ + BitCleanUnwritten, BitStateCount, BitNone = 0xff, }; @@ -232,6 +246,12 @@ enum llbitmap_action { * BitNeedSync. */ BitmapActionStale, + /* + * Proactive sync trigger for raid456 - builds XOR parity for + * Unwritten regions without requiring user data write first. 
+ */ + BitmapActionProactiveSync, + BitmapActionClearUnwritten, BitmapActionCount, /* Init state is BitUnwritten */ BitmapActionInit, @@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitNone, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNeedSyncUnwritten, + [BitmapActionClearUnwritten] = BitNone, }, [BitClean] = { [BitmapActionStartwrite] = BitDirty, @@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitDirty] = { [BitmapActionStartwrite] = BitNone, @@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitClean, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitNeedSync] = { [BitmapActionStartwrite] = BitNone, @@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitSyncing] = { [BitmapActionStartwrite] = BitNone, @@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, + }, + [BitNeedSyncUnwritten] = { + [BitmapActionStartwrite] = BitNeedSync, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + 
[BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitSyncingUnwritten] = { + [BitmapActionStartwrite] = BitSyncing, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitCleanUnwritten, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitCleanUnwritten] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, }, }; @@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, pctl->state[pos] = level_456 ? BitNeedSync : BitDirty; break; case BitClean: + case BitCleanUnwritten: pctl->state[pos] = BitDirty; break; } @@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, } static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, - int offset) + int offset, bool infect) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; unsigned int io_size = llbitmap->io_size; @@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, * resync all the dirty bits, hence skip infect new dirty bits to * prevent resync unnecessary data. 
*/ - if (llbitmap->mddev->degraded) { + if (llbitmap->mddev->degraded || !infect) { set_bit(block, pctl->dirty); return; } @@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, llbitmap->pctl[idx]->state[bit] = state; if (state == BitDirty || state == BitNeedSync) - llbitmap_set_page_dirty(llbitmap, idx, bit); + llbitmap_set_page_dirty(llbitmap, idx, bit, true); + else if (state == BitNeedSyncUnwritten) + llbitmap_set_page_dirty(llbitmap, idx, bit, false); } static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) @@ -459,7 +528,8 @@ static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) rdev_for_each(rdev, mddev) { sector_t sector; - if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) continue; sector = mddev->bitmap_info.offset + @@ -584,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap) return 0; } +/* + * Check if all underlying disks support write_zeroes with unmap. + */ +static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0) + return false; + } + + return true; +} + +/* + * Issue write_zeroes to all underlying disks to zero their data regions. + * This ensures parity consistency for RAID-456 (0 XOR 0 = 0). + * Returns true if all disks were successfully zeroed. 
+ */ +static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + sector_t dev_sectors = mddev->dev_sectors; + int ret; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + ret = blkdev_issue_zeroout(rdev->bdev, + rdev->data_offset, + dev_sectors, + GFP_KERNEL, 0); + if (ret) { + pr_warn("md/llbitmap: failed to zero disk %pg: %d\n", + rdev->bdev, ret); + return false; + } + } + + return true; +} + static void llbitmap_init_state(struct llbitmap *llbitmap) { + struct mddev *mddev = llbitmap->mddev; enum llbitmap_state state = BitUnwritten; unsigned long i; - if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) { state = BitClean; + } else if (raid_is_456(mddev) && + llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) { + /* + * All disks support write_zeroes with unmap. Zero all disks + * to ensure parity consistency, then set BitCleanUnwritten + * to skip initial sync. + */ + if (llbitmap_zero_all_disks(llbitmap)) + state = BitCleanUnwritten; + } for (i = 0; i < llbitmap->chunks; i++) llbitmap_write(llbitmap, state, i); @@ -626,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, goto write_bitmap; } - if (c == BitNeedSync) + if (c == BitNeedSync || c == BitNeedSyncUnwritten) need_resync = !mddev->degraded; state = state_machine[c][action]; - write_bitmap: if (unlikely(mddev->degraded)) { /* For degraded array, mark new data as need sync. 
*/ @@ -657,8 +786,7 @@ write_bitmap: } llbitmap_write(llbitmap, state, start); - - if (state == BitNeedSync) + if (state == BitNeedSync || state == BitNeedSyncUnwritten) need_resync = !mddev->degraded; else if (state == BitDirty && !timer_pending(&llbitmap->pending_timer)) @@ -1069,12 +1197,12 @@ static void llbitmap_start_write(struct mddev *mddev, sector_t offset, int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; - llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); - while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); } static void llbitmap_end_write(struct mddev *mddev, sector_t offset, @@ -1101,12 +1229,12 @@ static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; - llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); - while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); } static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, @@ -1228,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) unsigned long p = offset >> llbitmap->chunkshift; enum llbitmap_state c = llbitmap_read(llbitmap, p); - return c == BitClean || c == BitDirty; + return c == BitClean || c == BitDirty || c == BitCleanUnwritten; } static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) @@ -1242,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) if (c == BitUnwritten) return blocks; + /* Skip CleanUnwritten - no user data, will be reset after recovery */ + if (c == BitCleanUnwritten) + return blocks; + /* For degraded array, 
don't skip */ if (mddev->degraded) return 0; @@ -1260,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, { struct llbitmap *llbitmap = mddev->bitmap; unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state state; + + /* + * Before recovery starts, convert CleanUnwritten to Unwritten. + * This ensures the new disk won't have stale parity data. + */ + if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionClearUnwritten); + /* * Handle one bit at a time, this is much simpler. And it doesn't matter * if md_do_sync() loop more times. */ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); - return llbitmap_state_machine(llbitmap, p, p, - BitmapActionStartsync) == BitSyncing; + state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync); + return state == BitSyncing || state == BitSyncingUnwritten; } /* Something is wrong, sync_thread stop at @offset */ @@ -1473,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page) } mutex_unlock(&mddev->bitmap_info.mutex); - return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + return sprintf(page, + "unwritten %d\nclean %d\ndirty %d\n" + "need sync %d\nsyncing %d\n" + "need sync unwritten %d\nsyncing unwritten %d\n" + "clean unwritten %d\n", bits[BitUnwritten], bits[BitClean], bits[BitDirty], - bits[BitNeedSync], bits[BitSyncing]); + bits[BitNeedSync], bits[BitSyncing], + bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten], + bits[BitCleanUnwritten]); } static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); @@ -1548,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); +static ssize_t +proactive_sync_store(struct mddev *mddev, const char *buf, size_t 
len) +{ + struct llbitmap *llbitmap; + + /* Only for RAID-456 */ + if (!raid_is_456(mddev)) + return -EINVAL; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || !llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return -ENODEV; + } + + /* Trigger proactive sync on all Unwritten regions */ + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionProactiveSync); + + mutex_unlock(&mddev->bitmap_info.mutex); + return len; +} + +static struct md_sysfs_entry llbitmap_proactive_sync = + __ATTR(proactive_sync, 0200, NULL, proactive_sync_store); + static struct attribute *md_llbitmap_attrs[] = { &llbitmap_bits.attr, &llbitmap_metadata.attr, &llbitmap_daemon_sleep.attr, &llbitmap_barrier_idle.attr, + &llbitmap_proactive_sync.attr, NULL }; diff --git a/drivers/md/md.c b/drivers/md/md.c index 3ce6f9e9d38e..5fb5ae8368ba 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); -static struct workqueue_struct *md_wq; /* * This workqueue is used for sync_work to register new sync_thread, and for @@ -98,7 +97,7 @@ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); -static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void export_rdev(struct md_rdev *rdev); static void md_wakeup_thread_directly(struct md_thread __rcu **thread); /* @@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev) spin_lock_init(&serial_tmp->serial_lock); serial_tmp->serial_rb = RB_ROOT_CACHED; - init_waitqueue_head(&serial_tmp->serial_io_wait); } rdev->serial = serial; @@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible) } percpu_ref_kill(&mddev->active_io); + + /* + * RAID456 IO can sleep in wait_for_reshape while still holding an + 
* active_io reference. If reshape is already interrupted or frozen, + * wake those waiters so they can abort and drop the reference instead + * of deadlocking suspend. + */ + if (mddev->pers && mddev->pers->prepare_suspend && + reshape_interrupted(mddev)) + mddev->pers->prepare_suspend(mddev); + if (interruptible) err = wait_event_interruptible(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); @@ -959,7 +968,7 @@ void mddev_unlock(struct mddev *mddev) list_for_each_entry_safe(rdev, tmp, &delete, same_set) { list_del_init(&rdev->same_set); kobject_del(&rdev->kobj); - export_rdev(rdev, mddev); + export_rdev(rdev); } if (!legacy_async_del_gendisk) { @@ -2632,7 +2641,7 @@ void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ static struct md_rdev claim_rdev; -static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) +static void export_rdev(struct md_rdev *rdev) { pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); @@ -2788,7 +2797,9 @@ void md_update_sb(struct mddev *mddev, int force_change) if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev)); + if (!mddev_is_dm(mddev)) + pr_err_ratelimited("%s: can't update sb for read-only array %s\n", + __func__, mdname(mddev)); return; } @@ -4848,7 +4859,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) err = bind_rdev_to_array(rdev, mddev); out: if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); mddev_unlock_and_resume(mddev); if (!err) md_new_event(); @@ -6128,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); - mddev_put(mddev); + /* + * For "array_state=clear", dropping the extra kobject reference from + * sysfs_break_active_protection() can trigger md kobject deletion. 
+ * Restore active protection before mddev_put() so deletion happens + * after the sysfs write path fully unwinds. + */ if (kn) sysfs_unbreak_active_protection(kn); + mddev_put(mddev); return rv; } @@ -6447,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +/* + * Read bitmap superblock and return the bitmap_id based on disk version. + * This is used as fallback when default bitmap version and on-disk version + * doesn't match, and mdadm is not the latest version to set bitmap_type. + */ +static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct page *sb_page; + bitmap_super_t *sb; + enum md_submodule_id id = ID_BITMAP_NONE; + sector_t sector; + u32 version; + + if (!mddev->bitmap_info.offset) + return ID_BITMAP_NONE; + + sb_page = alloc_page(GFP_KERNEL); + if (!sb_page) { + pr_warn("md: %s: failed to allocate memory for bitmap\n", + mdname(mddev)); + return ID_BITMAP_NONE; + } + + sector = mddev->bitmap_info.offset; + + rdev_for_each(rdev, mddev) { + u32 iosize; + + if (!test_bit(In_sync, &rdev->flags) || + test_bit(Faulty, &rdev->flags) || + test_bit(Bitmap_sync, &rdev->flags)) + continue; + + iosize = roundup(sizeof(bitmap_super_t), + bdev_logical_block_size(rdev->bdev)); + if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ, + true)) + goto read_ok; + } + pr_warn("md: %s: failed to read bitmap from any device\n", + mdname(mddev)); + goto out; + +read_ok: + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_warn("md: %s: invalid bitmap magic 0x%x\n", + mdname(mddev), le32_to_cpu(sb->magic)); + goto out_unmap; + } + + version = le32_to_cpu(sb->version); + switch (version) { + case BITMAP_MAJOR_LO: + case BITMAP_MAJOR_HI: + case BITMAP_MAJOR_CLUSTERED: + id = ID_BITMAP; + break; + case BITMAP_MAJOR_LOCKLESS: + id = ID_LLBITMAP; + break; + default: + pr_warn("md: %s: unknown bitmap version %u\n", + mdname(mddev), 
version); + break; + } + +out_unmap: + kunmap_local(sb); +out: + __free_page(sb_page); + return id; +} + static int md_bitmap_create(struct mddev *mddev) { + enum md_submodule_id orig_id = mddev->bitmap_id; + enum md_submodule_id sb_id; + int err; + if (mddev->bitmap_id == ID_BITMAP_NONE) return -EINVAL; if (!mddev_set_bitmap_ops(mddev)) return -ENOENT; - return mddev->bitmap_ops->create(mddev); + err = mddev->bitmap_ops->create(mddev); + if (!err) + return 0; + + /* + * Create failed, if default bitmap version and on-disk version + * doesn't match, and mdadm is not the latest version to set + * bitmap_type, set bitmap_ops based on the disk version. + */ + mddev_clear_bitmap_ops(mddev); + + sb_id = md_bitmap_get_id_from_sb(mddev); + if (sb_id == ID_BITMAP_NONE || sb_id == orig_id) + return err; + + pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n", + mdname(mddev), orig_id, sb_id); + + mddev->bitmap_id = sb_id; + if (!mddev_set_bitmap_ops(mddev)) { + mddev->bitmap_id = orig_id; + return -ENOENT; + } + + err = mddev->bitmap_ops->create(mddev); + if (err) { + mddev_clear_bitmap_ops(mddev); + mddev->bitmap_id = orig_id; + } + + return err; } static void md_bitmap_destroy(struct mddev *mddev) @@ -7140,7 +7266,7 @@ static void autorun_devices(int part) rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) - export_rdev(rdev, mddev); + export_rdev(rdev); } autorun_array(mddev); mddev_unlock_and_resume(mddev); @@ -7150,7 +7276,7 @@ static void autorun_devices(int part) */ rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); - export_rdev(rdev, mddev); + export_rdev(rdev); } mddev_put(mddev); } @@ -7338,13 +7464,13 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, rdev0->bdev); - export_rdev(rdev, mddev); + export_rdev(rdev); return -EINVAL; } } err = bind_rdev_to_array(rdev, mddev); if 
(err) - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } @@ -7387,7 +7513,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* This was a hot-add request, but events doesn't * match, so reject it. */ - export_rdev(rdev, mddev); + export_rdev(rdev); return -EINVAL; } @@ -7413,7 +7539,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) } } if (has_journal || mddev->bitmap) { - export_rdev(rdev, mddev); + export_rdev(rdev); return -EBUSY; } set_bit(Journal, &rdev->flags); @@ -7428,7 +7554,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* --add initiated by this node */ err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } } @@ -7438,7 +7564,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { @@ -7501,7 +7627,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) { - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } } @@ -7613,7 +7739,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) return 0; abort_export: - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } @@ -10503,10 +10629,6 @@ static int __init md_init(void) goto err_bitmap; ret = -ENOMEM; - md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0); - if (!md_wq) - goto err_wq; - md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0); if (!md_misc_wq) goto err_misc_wq; @@ -10531,8 +10653,6 @@ err_mdp: err_md: destroy_workqueue(md_misc_wq); err_misc_wq: - destroy_workqueue(md_wq); -err_wq: md_llbitmap_exit(); err_bitmap: md_bitmap_exit(); @@ -10841,7 +10961,6 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); 
- destroy_workqueue(md_wq); md_bitmap_exit(); } diff --git a/drivers/md/md.h b/drivers/md/md.h index ac84289664cd..d6f5482e2479 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -126,7 +126,6 @@ enum sync_action { struct serial_in_rdev { struct rb_root_cached serial_rb; spinlock_t serial_lock; - wait_queue_head_t serial_io_wait; }; /* @@ -381,7 +380,11 @@ struct serial_info { struct rb_node node; sector_t start; /* start sector of rb node */ sector_t last; /* end sector of rb node */ + sector_t wnode_start; /* address of waiting nodes on the same list */ sector_t _subtree_last; /* highest sector in subtree of rb node */ + struct list_head list_node; + struct list_head waiters; + struct completion ready; }; /* diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ef0045db409f..5e38a51e349a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } err = -ENOMEM; - conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones); + conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones); if (!conf->strip_zone) goto abort; - conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *), - conf->nr_strip_zones, - mddev->raid_disks), - GFP_KERNEL); + conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *), + conf->nr_strip_zones, + mddev->raid_disks), + GFP_KERNEL); if (!conf->devlist) goto abort; @@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) return 0; abort: - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); *private_conf = ERR_PTR(err); return err; @@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); } diff --git a/drivers/md/raid1.c 
b/drivers/md/raid1.c index 181400e147c0..ba91f7e61920 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, START, LAST, static inline, raid1_rb); static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, - struct serial_info *si, int idx) + struct serial_info *si) { unsigned long flags; int ret = 0; sector_t lo = r1_bio->sector; - sector_t hi = lo + r1_bio->sectors; + sector_t hi = lo + r1_bio->sectors - 1; + int idx = sector_to_idx(r1_bio->sector); struct serial_in_rdev *serial = &rdev->serial[idx]; + struct serial_info *head_si; spin_lock_irqsave(&serial->serial_lock, flags); /* collision happened */ - if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) + head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + if (head_si && head_si != si) { + si->start = lo; + si->last = hi; + si->wnode_start = head_si->wnode_start; + list_add_tail(&si->list_node, &head_si->waiters); ret = -EBUSY; - else { + } else if (!head_si) { si->start = lo; si->last = hi; + si->wnode_start = si->start; raid1_rb_insert(si, &serial->serial_rb); } spin_unlock_irqrestore(&serial->serial_lock, flags); @@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) { struct mddev *mddev = rdev->mddev; struct serial_info *si; - int idx = sector_to_idx(r1_bio->sector); - struct serial_in_rdev *serial = &rdev->serial[idx]; if (WARN_ON(!mddev->serial_info_pool)) return; si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - wait_event(serial->serial_io_wait, - check_and_add_serial(rdev, r1_bio, si, idx) == 0); + INIT_LIST_HEAD(&si->waiters); + INIT_LIST_HEAD(&si->list_node); + init_completion(&si->ready); + while (check_and_add_serial(rdev, r1_bio, si)) { + wait_for_completion(&si->ready); + reinit_completion(&si->ready); + } } static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *si; + struct 
serial_info *si, *iter_si; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; @@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); si; si = raid1_rb_iter_next(si, lo, hi)) { if (si->start == lo && si->last == hi) { - raid1_rb_remove(si, &serial->serial_rb); - mempool_free(si, mddev->serial_info_pool); found = 1; break; } } - if (!found) + if (found) { + raid1_rb_remove(si, &serial->serial_rb); + if (!list_empty(&si->waiters)) { + list_for_each_entry(iter_si, &si->waiters, list_node) { + if (iter_si->wnode_start == si->wnode_start) { + list_del_init(&iter_si->list_node); + list_splice_init(&si->waiters, &iter_si->waiters); + raid1_rb_insert(iter_si, &serial->serial_rb); + complete(&iter_si->ready); + break; + } + } + } + mempool_free(si, mddev->serial_info_pool); + } else { WARN(1, "The write IO is not recorded for serialization\n"); + } spin_unlock_irqrestore(&serial->serial_lock, flags); - wake_up(&serial->serial_io_wait); } /* @@ -452,7 +475,7 @@ static void raid1_end_write_request(struct bio *bio) int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; + sector_t hi = r1_bio->sector + r1_bio->sectors - 1; bool ignore_error = !raid1_should_handle_error(bio) || (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); @@ -1878,7 +1901,7 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, if (info->rdev) return false; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { set_bit(Nonrot, &rdev->flags); WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0653b5d8545a..4901ebe45c87 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -806,7 +806,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) 
break; - nonrot = bdev_nonrot(rdev->bdev); + nonrot = !bdev_rot(rdev->bdev); has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); if (min_pending > pending && nonrot) { @@ -1184,7 +1184,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) { - raid_end_bio_io(r10_bio); + free_r10bio(r10_bio); return; } @@ -1372,7 +1372,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sectors = r10_bio->sectors; if (!regular_request_wait(mddev, conf, bio, sectors)) { - raid_end_bio_io(r10_bio); + free_r10bio(r10_bio); return; } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 66b10cbda96d..7b7546bfa21f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, return -ENOMEM; while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; + payload = (void *)mb + mb_offset; payload_flush = (void *)mb + mb_offset; if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) goto mismatch; } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) @@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, payload->checksum[1]) < 0) goto mismatch; } else if 
(le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - /* nothing to do for R5LOG_PAYLOAD_FLUSH here */ + payload_len = sizeof(struct r5l_payload_flush) + + (sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ goto mismatch; - if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); - } else { - /* DATA or PARITY payload */ + if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) { log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); } - + mb_offset += payload_len; } put_page(page); @@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; int dd; payload = (void *)mb + mb_offset; @@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { int i, count; + payload_len = sizeof(struct r5l_payload_flush) + + (sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > + le32_to_cpu(mb->meta_size)) + return -EINVAL; + count = le32_to_cpu(payload_flush->size) / sizeof(__le64); for (i = 0; i < count; ++i) { stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]); @@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, } } - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); + mb_offset += payload_len; continue; } /* DATA or PARITY payload */ + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > 
le32_to_cpu(mb->meta_size)) + return -EINVAL; + stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ? raid5_compute_sector( conf, le64_to_cpu(payload->location), 0, &dd, @@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + mb_offset += payload_len; } return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8e8d431071b..6e79829c5acb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3916,6 +3916,8 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, break; } BUG_ON(other < 0); + if (test_bit(R5_LOCKED, &sh->dev[other].flags)) + return 0; pr_debug("Computing stripe %llu blocks %d,%d\n", (unsigned long long)sh->sector, disk_idx, other); @@ -4594,20 +4596,6 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) async_tx_quiesce(&tx); } -/* - * handle_stripe - do things to a stripe. - * - * We lock the stripe by setting STRIPE_ACTIVE and then examine the - * state of various bits to see what needs to be done. - * Possible results: - * return some read requests which now have data - * return some write requests which are safely on storage - * schedule a read on some buffers - * schedule a write of some buffers - * return confirmation of parity correctness - * - */ - static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) { struct r5conf *conf = sh->raid_conf; @@ -4901,6 +4889,18 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_bit(STRIPE_HANDLE, &head_sh->state); } +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe by setting STRIPE_ACTIVE and then examine the + * state of various bits to see what needs to be done. 
+ * Possible results: + * return some read requests which now have data + * return some write requests which are safely on storage + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + */ static void handle_stripe(struct stripe_head *sh) { struct stripe_head_state s; @@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { - raid5_release_stripe(sh); + int hash; + + spin_lock_irq(&conf->device_lock); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, + &conf->temp_inactive_list[hash]); + spin_unlock_irq(&conf->device_lock); conf->retry_read_aligned = raid_bio; conf->retry_read_offset = scnt; return handled; @@ -7541,7 +7547,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) continue; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { conf->batch_bio_dispatch = false; break; } @@ -7780,6 +7786,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); + lim.chunk_sectors = lim.io_opt >> 9; lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 110b1c2d0a86..1c7b710fc9c1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -801,7 +801,6 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx) } #endif -void md_raid5_kick_device(struct r5conf *conf); int raid5_set_cache_size(struct mddev *mddev, int size); sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); void raid5_release_stripe(struct stripe_head *sh); |
