diff options
Diffstat (limited to 'drivers/md')
81 files changed, 1697 insertions, 944 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index c58a9a8ea54e..a3fcdca7e6db 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -226,6 +226,7 @@ config BLK_DEV_DM select BLOCK_HOLDER_DEPRECATED if SYSFS select BLK_DEV_DM_BUILTIN select BLK_MQ_STACKING + select CRYPTO_LIB_SHA256 if IMA depends on DAX || DAX=n help Device-mapper is a low level volume manager. It works by allowing @@ -299,6 +300,7 @@ config DM_CRYPT select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV + select CRYPTO_LIB_AES select CRYPTO_LIB_MD5 # needed by lmk IV mode help This device-mapper target allows you to create a device that diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 64bb38c95895..97d9adb0bf96 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1373,6 +1373,14 @@ static CLOSURE_CALLBACK(cached_dev_free) mutex_unlock(&bch_register_lock); + /* + * Wait for any pending sb_write to complete before free. + * The sb_bio is embedded in struct cached_dev, so we must + * ensure no I/O is in progress. 
+ */ + down(&dc->sb_write_mutex); + up(&dc->sb_write_mutex); + if (dc->sb_disk) folio_put(virt_to_folio(dc->sb_disk)); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 60f7badec91f..26fedf5883ef 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -391,7 +391,7 @@ struct dm_buffer_cache { */ unsigned int num_locks; bool no_sleep; - struct buffer_tree trees[]; + struct buffer_tree trees[] __counted_by(num_locks); }; static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); @@ -2511,7 +2511,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign } num_locks = dm_num_hash_locks(); - c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL); + c = kzalloc_flex(*c, cache.trees, num_locks); if (!c) { r = -ENOMEM; goto bad_client; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 57158c02d096..acd9b179fcb3 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1023,6 +1023,12 @@ static bool cmd_write_lock(struct dm_cache_metadata *cmd) return; \ } while (0) +#define WRITE_LOCK_OR_GOTO(cmd, label) \ + do { \ + if (!cmd_write_lock((cmd))) \ + goto label; \ + } while (0) + #define WRITE_UNLOCK(cmd) \ up_write(&(cmd)->root_lock) @@ -1714,17 +1720,6 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * return r; } -int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) -{ - int r; - - READ_LOCK(cmd); - r = blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); - READ_UNLOCK(cmd); - - return r; -} - void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd) { WRITE_LOCK_VOID(cmd); @@ -1791,11 +1786,8 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); - WRITE_LOCK(cmd); - if (cmd->fail_io) { - WRITE_UNLOCK(cmd); - goto out; - } + /* cmd_write_lock() 
already checks fail_io with cmd->root_lock held */ + WRITE_LOCK_OR_GOTO(cmd, out); __destroy_persistent_data_objects(cmd, false); old_bm = cmd->bm; @@ -1824,3 +1816,12 @@ out: return r; } + +int dm_cache_metadata_clean_when_opened(struct dm_cache_metadata *cmd, bool *result) +{ + READ_LOCK(cmd); + *result = cmd->clean_when_opened; + READ_UNLOCK(cmd); + + return 0; +} diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 5f77890207fe..91f8706b41fd 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -135,17 +135,17 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, */ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p); -/* - * Query method. Are all the blocks in the cache clean? - */ -int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); - int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result); int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd); int dm_cache_metadata_abort(struct dm_cache_metadata *cmd); +/* + * Query method. Was the metadata cleanly shut down when opened? 
+ */ +int dm_cache_metadata_clean_when_opened(struct dm_cache_metadata *cmd, bool *result); + /*----------------------------------------------------------------*/ #endif /* DM_CACHE_METADATA_H */ diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index b328d9601046..dd77a93fd68d 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1589,14 +1589,18 @@ static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) { struct smq_policy *mq = to_smq_policy(p); struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + unsigned long flags; if (!e->allocated) return -ENODATA; + spin_lock_irqsave(&mq->lock, flags); // FIXME: what if this block has pending background work? del_queue(mq, e); h_remove(&mq->table, e); free_entry(&mq->cache_alloc, e); + spin_unlock_irqrestore(&mq->lock, flags); + return 0; } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 935ab79b1d0c..097315a9bf0f 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1462,11 +1462,19 @@ static void invalidate_complete(struct dm_cache_migration *mg, bool success) struct cache *cache = mg->cache; bio_list_init(&bios); - if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) - free_prison_cell(cache, mg->cell); + if (mg->cell) { + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + } - if (!success && mg->overwrite_bio) - bio_io_error(mg->overwrite_bio); + if (mg->overwrite_bio) { + // Set generic error if the bio hasn't been issued yet, + // e.g., invalidation or metadata commit failed before bio + // submission. Otherwise preserve the bio's own error status. 
+ if (!success && !mg->overwrite_bio->bi_status) + mg->overwrite_bio->bi_status = BLK_STS_IOERR; + bio_endio(mg->overwrite_bio); + } free_migration(mg); defer_bios(cache, &bios); @@ -1506,6 +1514,24 @@ static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) return r; } +static void invalidate_committed(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct bio *bio = mg->overwrite_bio; + struct per_bio_data *pb = get_per_bio_data(bio); + + if (mg->k.input) { + invalidate_complete(mg, false); + return; + } + + init_continuation(&mg->k, invalidate_completed); + remap_to_origin_clear_discard(cache, bio, mg->invalidate_oblock); + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); + dm_submit_bio_remap(bio, NULL); +} + static void invalidate_remove(struct work_struct *ws) { int r; @@ -1518,10 +1544,8 @@ static void invalidate_remove(struct work_struct *ws) return; } - init_continuation(&mg->k, invalidate_completed); + init_continuation(&mg->k, invalidate_committed); continue_after_commit(&cache->committer, &mg->k); - remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); - mg->overwrite_bio = NULL; schedule_commit(&cache->committer); } @@ -1539,6 +1563,15 @@ static int invalidate_lock(struct dm_cache_migration *mg) READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); if (r < 0) { free_prison_cell(cache, prealloc); + + /* Defer the bio for retrying the cell lock */ + if (mg->overwrite_bio) { + struct bio *bio = mg->overwrite_bio; + + mg->overwrite_bio = NULL; + defer_bio(cache, bio); + } + invalidate_complete(mg, false); return r; } @@ -1701,6 +1734,7 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, bio_drop_shared_lock(cache, bio); atomic_inc(&cache->stats.demotion); invalidate_start(cache, cblock, block, bio); + return DM_MAPIO_SUBMITTED; } else remap_to_origin_clear_discard(cache, bio, block); } else { @@ -2467,23 +2501,8 @@ static int 
cache_create(struct cache_args *ca, struct cache **result) goto bad; } - if (passthrough_mode(cache)) { - bool all_clean; - - r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); - if (r) { - *error = "dm_cache_metadata_all_clean() failed"; - goto bad; - } - - if (!all_clean) { - *error = "Cannot enter passthrough mode unless all blocks are clean"; - r = -EINVAL; - goto bad; - } - + if (passthrough_mode(cache)) policy_allow_migrations(cache->policy, false); - } spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); @@ -2810,6 +2829,12 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, struct cache *cache = context; if (dirty) { + if (passthrough_mode(cache)) { + DMERR("%s: cannot enter passthrough mode unless all blocks are clean", + cache_device_name(cache)); + return -EBUSY; + } + set_bit(from_cblock(cblock), cache->dirty_bitset); atomic_inc(&cache->nr_dirty); } else @@ -2929,6 +2954,9 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) static bool can_resume(struct cache *cache) { + bool clean_when_opened; + int r; + /* * Disallow retrying the resume operation for devices that failed the * first resume attempt, as the failure leaves the policy object partially @@ -2945,6 +2973,20 @@ static bool can_resume(struct cache *cache) return false; } + if (passthrough_mode(cache)) { + r = dm_cache_metadata_clean_when_opened(cache->cmd, &clean_when_opened); + if (r) { + DMERR("%s: failed to query metadata flags", cache_device_name(cache)); + return false; + } + + if (!clean_when_opened) { + DMERR("%s: unable to resume into passthrough mode after unclean shutdown", + cache_device_name(cache)); + return false; + } + } + return true; } @@ -3043,7 +3085,7 @@ static int cache_preresume(struct dm_target *ti) load_filtered_mapping, cache); if (r) { DMERR("%s: could not load cache mappings", cache_device_name(cache)); - if (r != -EFBIG) + if (r != -EFBIG && r != -EBUSY) metadata_operation_failed(cache, 
"dm_cache_load_mappings", r); return r; } @@ -3510,7 +3552,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 3, 0}, + .version = {2, 4, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 54823341c9fd..608b617fb817 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -32,6 +32,7 @@ #include <linux/ctype.h> #include <asm/page.h> #include <linux/unaligned.h> +#include <crypto/aes.h> #include <crypto/hash.h> #include <crypto/md5.h> #include <crypto/skcipher.h> @@ -109,11 +110,11 @@ struct crypt_iv_operations { const char *opts); void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); - int (*wipe)(struct crypt_config *cc); + void (*wipe)(struct crypt_config *cc); int (*generator)(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq); - int (*post)(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq); + void (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); }; struct iv_benbi_private { @@ -133,7 +134,7 @@ struct iv_tcw_private { #define ELEPHANT_MAX_KEY_SIZE 32 struct iv_elephant_private { - struct crypto_skcipher *tfm; + struct aes_enckey *key; }; /* @@ -507,14 +508,12 @@ static int crypt_iv_lmk_init(struct crypt_config *cc) return 0; } -static int crypt_iv_lmk_wipe(struct crypt_config *cc) +static void crypt_iv_lmk_wipe(struct crypt_config *cc) { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; if (lmk->seed) memset(lmk->seed, 0, LMK_SEED_SIZE); - - return 0; } static void crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, @@ -560,14 +559,14 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, return 0; } -static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static void crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct 
dm_crypt_request *dmreq) { struct scatterlist *sg; u8 *dst; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) - return 0; + return; sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); @@ -577,7 +576,6 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, crypto_xor(dst + sg->offset, iv, cc->iv_size); kunmap_local(dst); - return 0; } static void crypt_iv_tcw_dtr(struct crypt_config *cc) @@ -628,14 +626,12 @@ static int crypt_iv_tcw_init(struct crypt_config *cc) return 0; } -static int crypt_iv_tcw_wipe(struct crypt_config *cc) +static void crypt_iv_tcw_wipe(struct crypt_config *cc) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; memset(tcw->iv_seed, 0, cc->iv_size); memset(tcw->whitening, 0, TCW_WHITENING_SIZE); - - return 0; } static void crypt_iv_tcw_whitening(struct crypt_config *cc, @@ -687,22 +683,20 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, return 0; } -static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static void crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { struct scatterlist *sg; u8 *dst; if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) - return 0; + return; /* Apply whitening on ciphertext */ sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_local(dst); - - return 0; } static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, @@ -767,8 +761,8 @@ static void crypt_iv_elephant_dtr(struct crypt_config *cc) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; - crypto_free_skcipher(elephant->tfm); - elephant->tfm = NULL; + kfree_sensitive(elephant->key); + elephant->key = NULL; } static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -777,13 +771,9 @@ static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti, struct iv_elephant_private *elephant = 
&cc->iv_gen_private.elephant; int r; - elephant->tfm = crypto_alloc_skcipher("ecb(aes)", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(elephant->tfm)) { - r = PTR_ERR(elephant->tfm); - elephant->tfm = NULL; - return r; - } + elephant->key = kmalloc_obj(*elephant->key); + if (!elephant->key) + return -ENOMEM; r = crypt_iv_eboiv_ctr(cc, ti, NULL); if (r) @@ -935,41 +925,28 @@ static void diffuser_b_encrypt(u32 *d, size_t n) } } -static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq) +static void crypt_iv_elephant(struct crypt_config *cc, + struct dm_crypt_request *dmreq) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; - u8 *es, *ks, *data, *data2, *data_offset; - struct skcipher_request *req; - struct scatterlist *sg, *sg2, src, dst; - DECLARE_CRYPTO_WAIT(wait); - int i, r; - - req = skcipher_request_alloc(elephant->tfm, GFP_NOIO); - es = kzalloc(16, GFP_NOIO); /* Key for AES */ - ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */ - - if (!req || !es || !ks) { - r = -ENOMEM; - goto out; - } + u8 *data, *data2, *data_offset; + struct scatterlist *sg, *sg2; + union { + __le64 w[2]; + u8 b[16]; + } es; + u8 ks[32] __aligned(__alignof(long)); /* Elephant sector key */ + int i; - *(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + es.w[0] = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + es.w[1] = 0; /* E(Ks, e(s)) */ - sg_init_one(&src, es, 16); - sg_init_one(&dst, ks, 16); - skcipher_request_set_crypt(req, &src, &dst, 16, NULL); - skcipher_request_set_callback(req, 0, crypto_req_done, &wait); - r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); - if (r) - goto out; + aes_encrypt(elephant->key, &ks[0], es.b); /* E(Ks, e'(s)) */ - es[15] = 0x80; - sg_init_one(&dst, &ks[16], 16); - r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); - if (r) - goto out; + es.b[15] = 0x80; + aes_encrypt(elephant->key, &ks[16], es.b); sg = crypt_get_sg_data(cc, dmreq->sg_out); data = 
kmap_local_page(sg_page(sg)); @@ -1001,34 +978,24 @@ static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *d } kunmap_local(data); -out: - kfree_sensitive(ks); - kfree_sensitive(es); - skcipher_request_free(req); - return r; + memzero_explicit(ks, sizeof(ks)); + memzero_explicit(&es, sizeof(es)); } static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { - int r; - - if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { - r = crypt_iv_elephant(cc, dmreq); - if (r) - return r; - } + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + crypt_iv_elephant(cc, dmreq); return crypt_iv_eboiv_gen(cc, iv, dmreq); } -static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq) +static void crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) - return crypt_iv_elephant(cc, dmreq); - - return 0; + crypt_iv_elephant(cc, dmreq); } static int crypt_iv_elephant_init(struct crypt_config *cc) @@ -1036,16 +1003,14 @@ static int crypt_iv_elephant_init(struct crypt_config *cc) struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; int key_offset = cc->key_size - cc->key_extra_size; - return crypto_skcipher_setkey(elephant->tfm, &cc->key[key_offset], cc->key_extra_size); + return aes_prepareenckey(elephant->key, &cc->key[key_offset], cc->key_extra_size); } -static int crypt_iv_elephant_wipe(struct crypt_config *cc) +static void crypt_iv_elephant_wipe(struct crypt_config *cc) { struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; - u8 key[ELEPHANT_MAX_KEY_SIZE]; - memset(key, 0, cc->key_extra_size); - return crypto_skcipher_setkey(elephant->tfm, key, cc->key_extra_size); + memzero_explicit(elephant->key, sizeof(*elephant->key)); } static const struct crypt_iv_operations crypt_iv_plain_ops = { @@ -1376,7 +1341,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, } 
if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) - r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + cc->iv_gen_ops->post(cc, org_iv, dmreq); bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); @@ -1453,7 +1418,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, r = crypto_skcipher_decrypt(req); if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) - r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + cc->iv_gen_ops->post(cc, org_iv, dmreq); bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); @@ -2217,7 +2182,7 @@ static void kcryptd_async_done(void *data, int error) } if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) - error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); + cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); if (error == -EBADMSG) { sector_t s = le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)); @@ -2673,11 +2638,8 @@ static int crypt_wipe_key(struct crypt_config *cc) get_random_bytes(&cc->key, cc->key_size); /* Wipe IV private keys */ - if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { - r = cc->iv_gen_ops->wipe(cc); - if (r) - return r; - } + if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) + cc->iv_gen_ops->wipe(cc); kfree_sensitive(cc->key_string); cc->key_string = NULL; @@ -3717,11 +3679,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct crypt_config *cc = ti->private; - limits->logical_block_size = - max_t(unsigned int, limits->logical_block_size, cc->sector_size); - limits->physical_block_size = - max_t(unsigned int, limits->physical_block_size, cc->sector_size); - limits->io_min = max_t(unsigned int, limits->io_min, cc->sector_size); + dm_stack_bs_limits(limits, cc->sector_size); limits->dma_alignment = limits->logical_block_size - 1; /* diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index efb3cd4f9cd4..9495ca035056 
100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -12,9 +12,7 @@ #include <linux/ima.h> #include <linux/sched/mm.h> -#include <crypto/hash.h> -#include <linux/crypto.h> -#include <crypto/hash_info.h> +#include <crypto/sha2.h> #define DM_MSG_PREFIX "ima" @@ -178,19 +176,13 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl size_t device_data_buf_len, target_metadata_buf_len, target_data_buf_len, l = 0; char *target_metadata_buf = NULL, *target_data_buf = NULL, *digest_buf = NULL; char *ima_buf = NULL, *device_data_buf = NULL; - int digest_size, last_target_measured = -1, r; + int last_target_measured = -1; status_type_t type = STATUSTYPE_IMA; size_t cur_total_buf_len = 0; unsigned int num_targets, i; - SHASH_DESC_ON_STACK(shash, NULL); - struct crypto_shash *tfm = NULL; - u8 *digest = NULL; + struct sha256_ctx hash_ctx; + u8 digest[SHA256_DIGEST_SIZE]; bool noio = false; - /* - * In below hash_alg_prefix_len assignment +1 is for the additional char (':'), - * when prefixing the hash value with the hash algorithm name. e.g. sha256:<hash_value>. 
- */ - const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1; char table_load_event_name[] = "dm_table_load"; ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, noio); @@ -210,19 +202,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (dm_ima_alloc_and_copy_device_data(table->md, &device_data_buf, num_targets, noio)) goto error; - tfm = crypto_alloc_shash(DM_IMA_TABLE_HASH_ALG, 0, 0); - if (IS_ERR(tfm)) - goto error; - - shash->tfm = tfm; - digest_size = crypto_shash_digestsize(tfm); - digest = dm_ima_alloc(digest_size, noio); - if (!digest) - goto error; - - r = crypto_shash_init(shash); - if (r) - goto error; + sha256_init(&hash_ctx); memcpy(ima_buf + l, DM_IMA_VERSION_STR, table->md->ima.dm_version_str_len); l += table->md->ima.dm_version_str_len; @@ -270,9 +250,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl */ if (unlikely(cur_total_buf_len >= DM_IMA_MEASUREMENT_BUF_LEN)) { dm_ima_measure_data(table_load_event_name, ima_buf, l, noio); - r = crypto_shash_update(shash, (const u8 *)ima_buf, l); - if (r < 0) - goto error; + sha256_update(&hash_ctx, (const u8 *)ima_buf, l); memset(ima_buf, 0, DM_IMA_MEASUREMENT_BUF_LEN); l = 0; @@ -311,9 +289,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (!last_target_measured) { dm_ima_measure_data(table_load_event_name, ima_buf, l, noio); - r = crypto_shash_update(shash, (const u8 *)ima_buf, l); - if (r < 0) - goto error; + sha256_update(&hash_ctx, (const u8 *)ima_buf, l); } /* @@ -321,20 +297,13 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl * so that the table data can be verified against the future device state change * events, e.g. resume, rename, remove, table-clear etc. 
*/ - r = crypto_shash_final(shash, digest); - if (r < 0) - goto error; - - digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, noio); + sha256_final(&hash_ctx, digest); + digest_buf = kasprintf(GFP_KERNEL, "sha256:%*phN", SHA256_DIGEST_SIZE, + digest); if (!digest_buf) goto error; - snprintf(digest_buf, hash_alg_prefix_len + 1, "%s:", DM_IMA_TABLE_HASH_ALG); - - for (i = 0; i < digest_size; i++) - snprintf((digest_buf + hash_alg_prefix_len + (i*2)), 3, "%02x", digest[i]); - if (table->md->ima.active_table.hash != table->md->ima.inactive_table.hash) kfree(table->md->ima.inactive_table.hash); @@ -354,9 +323,6 @@ error: kfree(digest_buf); kfree(device_data_buf); exit: - kfree(digest); - if (tfm) - crypto_free_shash(tfm); kfree(ima_buf); kfree(target_metadata_buf); kfree(target_data_buf); diff --git a/drivers/md/dm-ima.h b/drivers/md/dm-ima.h index 568870a1a145..a403deca6093 100644 --- a/drivers/md/dm-ima.h +++ b/drivers/md/dm-ima.h @@ -15,7 +15,6 @@ #define DM_IMA_TARGET_METADATA_BUF_LEN 128 #define DM_IMA_TARGET_DATA_BUF_LEN 2048 #define DM_IMA_DEVICE_CAPACITY_BUF_LEN 128 -#define DM_IMA_TABLE_HASH_ALG "sha256" #define __dm_ima_stringify(s) #s #define __dm_ima_str(s) __dm_ima_stringify(s) diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index 7403823384c5..c1bacba92c65 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -303,8 +303,10 @@ static int __init dm_init_init(void) } } - if (waitfor[0]) + if (waitfor[0]) { + wait_for_device_probe(); DMINFO("all devices available"); + } list_for_each_entry(dev, &devices, list) { if (dm_early_create(&dev->dmi, dev->table, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 06e805902151..65c30dec8222 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4046,13 +4046,9 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim { struct dm_integrity_c *ic = ti->private; - if (ic->sectors_per_block > 1) { - 
limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT; - limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT; - limits->io_min = ic->sectors_per_block << SECTOR_SHIFT; - limits->dma_alignment = limits->logical_block_size - 1; - limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; - } + dm_stack_bs_limits(limits, ic->sectors_per_block << SECTOR_SHIFT); + limits->dma_alignment = limits->logical_block_size - 1; + limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; if (!ic->internal_hash) { struct blk_integrity *bi = &limits->integrity; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 3ab8b4beff86..a529174c94cf 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -64,7 +64,11 @@ struct vers_iter { static struct rb_root name_rb_tree = RB_ROOT; static struct rb_root uuid_rb_tree = RB_ROOT; -static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred); +#define DM_REMOVE_KEEP_OPEN_DEVICES 1 +#define DM_REMOVE_MARK_DEFERRED 2 +#define DM_REMOVE_ONLY_DEFERRED 4 +#define DM_REMOVE_INTERRUPTIBLE 8 +static int dm_hash_remove_all(unsigned flags); /* * Guards access to both hash tables. 
@@ -78,7 +82,7 @@ static DEFINE_MUTEX(dm_hash_cells_mutex); static void dm_hash_exit(void) { - dm_hash_remove_all(false, false, false); + dm_hash_remove_all(0); } /* @@ -333,7 +337,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc) return table; } -static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred) +static int dm_hash_remove_all(unsigned flags) { int dev_skipped; struct rb_node *n; @@ -347,12 +351,17 @@ retry: down_write(&_hash_lock); for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) { + if (flags & DM_REMOVE_INTERRUPTIBLE && fatal_signal_pending(current)) { + up_write(&_hash_lock); + return -EINTR; + } + hc = container_of(n, struct hash_cell, name_node); md = hc->md; dm_get(md); - if (keep_open_devices && - dm_lock_for_deletion(md, mark_deferred, only_deferred)) { + if (flags & DM_REMOVE_KEEP_OPEN_DEVICES && + dm_lock_for_deletion(md, !!(flags & DM_REMOVE_MARK_DEFERRED), !!(flags & DM_REMOVE_ONLY_DEFERRED))) { dm_put(md); dev_skipped++; continue; @@ -368,7 +377,7 @@ retry: } dm_ima_measure_on_device_remove(md, true); dm_put(md); - if (likely(keep_open_devices)) + if (likely(flags & DM_REMOVE_KEEP_OPEN_DEVICES)) dm_destroy(md); else dm_destroy_immediate(md); @@ -384,8 +393,10 @@ retry: up_write(&_hash_lock); - if (dev_skipped) + if (dev_skipped && !(flags & DM_REMOVE_ONLY_DEFERRED)) DMWARN("remove_all left %d open device(s)", dev_skipped); + + return 0; } /* @@ -513,7 +524,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, void dm_deferred_remove(void) { - dm_hash_remove_all(true, false, true); + dm_hash_remove_all(DM_REMOVE_KEEP_OPEN_DEVICES | DM_REMOVE_ONLY_DEFERRED); } /* @@ -529,9 +540,13 @@ typedef int (*ioctl_fn)(struct file *filp, struct dm_ioctl *param, size_t param_ static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_size) { - dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false); + int r; + int flags = DM_REMOVE_KEEP_OPEN_DEVICES | 
DM_REMOVE_INTERRUPTIBLE; + if (param->flags & DM_DEFERRED_REMOVE) + flags |= DM_REMOVE_MARK_DEFERRED; + r = dm_hash_remove_all(flags); param->data_size = 0; - return 0; + return r; } /* @@ -1341,6 +1356,10 @@ static void retrieve_status(struct dm_table *table, used = param->data_start + (outptr - outbuf); outptr = align_ptr(outptr); + if (!outptr || outptr > outbuf + len) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } spec->next = outptr - outbuf; } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 1aa6a4a7d232..d316757a328b 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -373,7 +373,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, struct log_c *lc; uint32_t region_size; - unsigned int region_count; + sector_t region_count; size_t bitset_size, buf_size; int r; char dummy; @@ -401,6 +401,10 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, } region_count = dm_sector_div_up(ti->len, region_size); + if (region_count > UINT_MAX) { + DMWARN("region count exceeds limit of %u", UINT_MAX); + return -EINVAL; + } lc = kmalloc_obj(*lc); if (!lc) { diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 8f4ae2f51545..7cb7bb6233b6 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -102,7 +102,6 @@ struct multipath { struct bio_list queued_bios; struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ - bool is_suspending; }; /* @@ -1749,9 +1748,6 @@ static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; - spin_lock_irq(&m->lock); - m->is_suspending = true; - spin_unlock_irq(&m->lock); /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) queue_if_no_path(m, false, true, __func__); @@ -1774,7 +1770,6 @@ static void multipath_resume(struct dm_target *ti) struct multipath *m = ti->private; spin_lock_irq(&m->lock); - 
m->is_suspending = false; if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); @@ -2098,7 +2093,7 @@ static int probe_active_paths(struct multipath *m) if (m->current_pg == m->last_probed_pg) goto skip_probe; } - if (!m->current_pg || m->is_suspending || + if (!m->current_pg || dm_suspended(m->ti) || test_bit(MPATHF_QUEUE_IO, &m->flags)) goto skip_probe; set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); @@ -2107,7 +2102,7 @@ static int probe_active_paths(struct multipath *m) list_for_each_entry(pgpath, &pg->pgpaths, list) { if (pg != READ_ONCE(m->current_pg) || - READ_ONCE(m->is_suspending)) + dm_suspended(m->ti)) goto out; if (!pgpath->is_active) continue; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 80a5c4127707..de5c00704e69 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -993,13 +993,13 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, return NULL; } - *args_used = 2 + param_count; - - if (argc < *args_used) { + if (param_count > argc - 2) { ti->error = "Insufficient mirror log arguments"; return NULL; } + *args_used = 2 + param_count; + dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, argv + 2); if (!dl) { diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c index e3bba0b28aad..b8a3977b815d 100644 --- a/drivers/md/dm-vdo/action-manager.c +++ b/drivers/md/dm-vdo/action-manager.c @@ -107,7 +107,7 @@ int vdo_make_action_manager(zone_count_t zones, struct action_manager **manager_ptr) { struct action_manager *manager; - int result = vdo_allocate(1, struct action_manager, __func__, &manager); + int result = vdo_allocate(1, __func__, &manager); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index a7db5b41155e..5ffc360540ed 100644 --- a/drivers/md/dm-vdo/block-map.c +++ 
b/drivers/md/dm-vdo/block-map.c @@ -221,8 +221,7 @@ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) u64 size = cache->page_count * (u64) VDO_BLOCK_SIZE; int result; - result = vdo_allocate(cache->page_count, struct page_info, "page infos", - &cache->infos); + result = vdo_allocate(cache->page_count, "page infos", &cache->infos); if (result != VDO_SUCCESS) return result; @@ -2364,18 +2363,15 @@ static int make_segment(struct forest *old_forest, block_count_t new_pages, forest->segments = index + 1; - result = vdo_allocate(forest->segments, struct boundary, - "forest boundary array", &forest->boundaries); + result = vdo_allocate(forest->segments, "forest boundary array", &forest->boundaries); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(forest->segments, struct tree_page *, - "forest page pointers", &forest->pages); + result = vdo_allocate(forest->segments, "forest page pointers", &forest->pages); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(new_pages, struct tree_page, - "new forest pages", &forest->pages[index]); + result = vdo_allocate(new_pages, "new forest pages", &forest->pages[index]); if (result != VDO_SUCCESS) return result; @@ -2400,9 +2396,7 @@ static int make_segment(struct forest *old_forest, block_count_t new_pages, struct block_map_tree *tree = &(forest->trees[root]); height_t height; - int result = vdo_allocate(forest->segments, - struct block_map_tree_segment, - "tree root segments", &tree->segments); + result = vdo_allocate(forest->segments, "tree root segments", &tree->segments); if (result != VDO_SUCCESS) return result; @@ -2478,9 +2472,7 @@ static int make_forest(struct block_map *map, block_count_t entries) return VDO_SUCCESS; } - result = vdo_allocate_extended(struct forest, map->root_count, - struct block_map_tree, __func__, - &forest); + result = vdo_allocate_extended(map->root_count, trees, __func__, &forest); if (result != VDO_SUCCESS) return result; @@ -2707,8 
+2699,7 @@ void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback, struct cursors *cursors; int result; - result = vdo_allocate_extended(struct cursors, map->root_count, - struct cursor, __func__, &cursors); + result = vdo_allocate_extended(map->root_count, cursors, __func__, &cursors); if (result != VDO_SUCCESS) { vdo_fail_completion(completion, result); return; @@ -2758,9 +2749,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map, zone->thread_id = vdo->thread_config.logical_threads[zone_number]; zone->block_map = map; - result = vdo_allocate_extended(struct dirty_lists, maximum_age, - dirty_era_t, __func__, - &zone->dirty_lists); + result = vdo_allocate_extended(maximum_age, eras, __func__, &zone->dirty_lists); if (result != VDO_SUCCESS) return result; @@ -2900,9 +2889,8 @@ int vdo_decode_block_map(struct block_map_state_2_0 state, block_count_t logical if (result != VDO_SUCCESS) return result; - result = vdo_allocate_extended(struct block_map, - vdo->thread_config.logical_zone_count, - struct block_map_zone, __func__, &map); + result = vdo_allocate_extended(vdo->thread_config.logical_zone_count, + zones, __func__, &map); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/block-map.h b/drivers/md/dm-vdo/block-map.h index 39a13039e4a3..4fd24043b0d7 100644 --- a/drivers/md/dm-vdo/block-map.h +++ b/drivers/md/dm-vdo/block-map.h @@ -276,7 +276,7 @@ struct block_map { block_count_t next_entry_count; zone_count_t zone_count; - struct block_map_zone zones[]; + struct block_map_zone zones[] __counted_by(zone_count); }; /** diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h index 2a8b03779f87..b84e7edeb22e 100644 --- a/drivers/md/dm-vdo/constants.h +++ b/drivers/md/dm-vdo/constants.h @@ -44,6 +44,9 @@ enum { /* The default size of each slab journal, in blocks */ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, + /* The recovery journal starting sequence number set at format time */ + 
RECOVERY_JOURNAL_STARTING_SEQUENCE_NUMBER = 1, + /* * The initial size of lbn_operations and pbn_operations, which is based upon the expected * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely @@ -57,8 +60,14 @@ enum { /* The maximum number of physical zones */ MAX_VDO_PHYSICAL_ZONES = 16, - /* The base-2 logarithm of the maximum blocks in one slab */ - MAX_VDO_SLAB_BITS = 23, + /* The default blocks in one slab */ + DEFAULT_VDO_SLAB_BLOCKS = 1U << 19, + + /* The minimum blocks in one slab */ + MIN_VDO_SLAB_BLOCKS = 1U << 13, + + /* The maximum blocks in one slab */ + MAX_VDO_SLAB_BLOCKS = 1U << 23, /* The maximum number of slabs the slab depot supports */ MAX_VDO_SLABS = 8192, diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 3333e1e5b02e..370d4239ba31 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -842,8 +842,7 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, struct data_vio_pool *pool; data_vio_count_t i; - result = vdo_allocate_extended(struct data_vio_pool, pool_size, struct data_vio, - __func__, &pool); + result = vdo_allocate_extended(pool_size, data_vios, __func__, &pool); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 75a26f3f4461..5f5639d89bc6 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -296,7 +296,7 @@ struct hash_zones { /* The number of zones */ zone_count_t zone_count; /* The hash zones themselves */ - struct hash_zone zones[]; + struct hash_zone zones[] __counted_by(zone_count); }; /* These are in milliseconds. 
*/ @@ -2364,8 +2364,7 @@ static int __must_check initialize_zone(struct vdo *vdo, struct hash_zones *zone vdo_set_completion_callback(&zone->completion, timeout_index_operations_callback, zone->thread_id); INIT_LIST_HEAD(&zone->lock_pool); - result = vdo_allocate(LOCK_POOL_CAPACITY, struct hash_lock, "hash_lock array", - &zone->lock_array); + result = vdo_allocate(LOCK_POOL_CAPACITY, "hash_lock array", &zone->lock_array); if (result != VDO_SUCCESS) return result; @@ -2418,8 +2417,7 @@ int vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = vdo_allocate_extended(struct hash_zones, zone_count, struct hash_zone, - __func__, &zones); + result = vdo_allocate_extended(zone_count, zones, __func__, &zones); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 6af40d40f255..1d8375cc3c3e 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -9,6 +9,7 @@ #include <linux/delay.h> #include <linux/device-mapper.h> #include <linux/err.h> +#include <linux/log2.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> @@ -60,6 +61,11 @@ enum admin_phases { LOAD_PHASE_DRAIN_JOURNAL, LOAD_PHASE_WAIT_FOR_READ_ONLY, PRE_LOAD_PHASE_START, + PRE_LOAD_PHASE_FORMAT_START, + PRE_LOAD_PHASE_FORMAT_SUPER, + PRE_LOAD_PHASE_FORMAT_GEOMETRY, + PRE_LOAD_PHASE_FORMAT_END, + PRE_LOAD_PHASE_LOAD_SUPER, PRE_LOAD_PHASE_LOAD_COMPONENTS, PRE_LOAD_PHASE_END, PREPARE_GROW_PHYSICAL_PHASE_START, @@ -109,6 +115,11 @@ static const char * const ADMIN_PHASE_NAMES[] = { "LOAD_PHASE_DRAIN_JOURNAL", "LOAD_PHASE_WAIT_FOR_READ_ONLY", "PRE_LOAD_PHASE_START", + "PRE_LOAD_PHASE_FORMAT_START", + "PRE_LOAD_PHASE_FORMAT_SUPER", + "PRE_LOAD_PHASE_FORMAT_GEOMETRY", + "PRE_LOAD_PHASE_FORMAT_END", + "PRE_LOAD_PHASE_LOAD_SUPER", "PRE_LOAD_PHASE_LOAD_COMPONENTS", "PRE_LOAD_PHASE_END", "PREPARE_GROW_PHYSICAL_PHASE_START", @@ 
-273,8 +284,7 @@ static int split_string(const char *string, char separator, char ***substring_ar substring_count++; } - result = vdo_allocate(substring_count + 1, char *, "string-splitting array", - &substrings); + result = vdo_allocate(substring_count + 1, "string-splitting array", &substrings); if (result != VDO_SUCCESS) return result; @@ -282,7 +292,7 @@ static int split_string(const char *string, char separator, char ***substring_ar if (*s == separator) { ptrdiff_t length = s - string; - result = vdo_allocate(length + 1, char, "split string", + result = vdo_allocate(length + 1, "split string", &substrings[current_substring]); if (result != VDO_SUCCESS) { free_string_array(substrings); @@ -303,8 +313,7 @@ static int split_string(const char *string, char separator, char ***substring_ar BUG_ON(current_substring != (substring_count - 1)); length = strlen(string); - result = vdo_allocate(length + 1, char, "split string", - &substrings[current_substring]); + result = vdo_allocate(length + 1, "split string", &substrings[current_substring]); if (result != VDO_SUCCESS) { free_string_array(substrings); return result; @@ -332,7 +341,7 @@ static int join_strings(char **substring_array, size_t array_length, char separa for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) string_length += strlen(substring_array[i]) + 1; - result = vdo_allocate(string_length, char, __func__, &output); + result = vdo_allocate(string_length, __func__, &output); if (result != VDO_SUCCESS) return result; @@ -380,6 +389,75 @@ static inline int __must_check parse_bool(const char *bool_str, const char *true } /** + * parse_memory() - Parse a string into an index memory value. + * @memory_str: The string value to convert to a memory value. + * @memory_ptr: A pointer to return the memory value in. 
+ * + * Return: VDO_SUCCESS or an error + */ +static int __must_check parse_memory(const char *memory_str, + uds_memory_config_size_t *memory_ptr) +{ + uds_memory_config_size_t memory; + + if (strcmp(memory_str, "0.25") == 0) { + memory = UDS_MEMORY_CONFIG_256MB; + } else if ((strcmp(memory_str, "0.5") == 0) || (strcmp(memory_str, "0.50") == 0)) { + memory = UDS_MEMORY_CONFIG_512MB; + } else if (strcmp(memory_str, "0.75") == 0) { + memory = UDS_MEMORY_CONFIG_768MB; + } else { + unsigned int value; + int result; + + result = kstrtouint(memory_str, 10, &value); + if (result) { + vdo_log_error("optional parameter error: invalid memory size, must be a positive integer"); + return -EINVAL; + } + + if (value > UDS_MEMORY_CONFIG_MAX) { + vdo_log_error("optional parameter error: invalid memory size, must not be greater than %d", + UDS_MEMORY_CONFIG_MAX); + return -EINVAL; + } + + memory = value; + } + + *memory_ptr = memory; + return VDO_SUCCESS; +} + +/** + * parse_slab_size() - Parse a string option into a slab size value. + * @slab_str: The string value representing slab size. + * @slab_size_ptr: A pointer to return the slab size in. + * + * Return: VDO_SUCCESS or an error + */ +static int __must_check parse_slab_size(const char *slab_str, block_count_t *slab_size_ptr) +{ + block_count_t value; + int result; + + result = kstrtoull(slab_str, 10, &value); + if (result) { + vdo_log_error("optional parameter error: invalid slab size, must be a positive integer"); + return -EINVAL; + } + + if (value < MIN_VDO_SLAB_BLOCKS || value > MAX_VDO_SLAB_BLOCKS || (!is_power_of_2(value))) { + vdo_log_error("optional parameter error: invalid slab size, must be a power of two between %u and %u", + MIN_VDO_SLAB_BLOCKS, MAX_VDO_SLAB_BLOCKS); + return -EINVAL; + } + + *slab_size_ptr = value; + return VDO_SUCCESS; +} + +/** + * process_one_thread_config_spec() - Process one component of a thread parameter configuration * string and update the configuration data structure. 
* @thread_param_type: The type of thread specified. @@ -568,7 +646,7 @@ static int process_one_key_value_pair(const char *key, unsigned int value, } /* Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 */ if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { - vdo_log_error("optional parameter error: at most %d max discard blocks are allowed", + vdo_log_error("optional parameter error: at most %d max discard blocks are allowed", UINT_MAX / VDO_BLOCK_SIZE); return -EINVAL; } @@ -600,7 +678,16 @@ static int parse_one_key_value_pair(const char *key, const char *value, if (strcmp(key, "compression") == 0) return parse_bool(value, "on", "off", &config->compression); - /* The remaining arguments must have integral values. */ + if (strcmp(key, "indexSparse") == 0) + return parse_bool(value, "on", "off", &config->index_sparse); + + if (strcmp(key, "indexMemory") == 0) + return parse_memory(value, &config->index_memory); + + if (strcmp(key, "slabSize") == 0) + return parse_slab_size(value, &config->slab_blocks); + + /* The remaining arguments must have non-negative integral values. 
*/ result = kstrtouint(value, 10, &count); if (result) { vdo_log_error("optional config string error: integer value needed, found \"%s\"", @@ -715,6 +802,12 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, struct device_config *config = NULL; int result; + if (logical_bytes > (MAXIMUM_VDO_LOGICAL_BLOCKS * VDO_BLOCK_SIZE)) { + handle_parse_error(config, error_ptr, + "Logical size exceeds the maximum"); + return VDO_BAD_CONFIGURATION; + } + if ((logical_bytes % VDO_BLOCK_SIZE) != 0) { handle_parse_error(config, error_ptr, "Logical size must be a multiple of 4096"); @@ -726,7 +819,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, return VDO_BAD_CONFIGURATION; } - result = vdo_allocate(1, struct device_config, "device_config", &config); + result = vdo_allocate(1, "device_config", &config); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Could not allocate config structure"); @@ -758,6 +851,9 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, config->max_discard_blocks = 1; config->deduplication = true; config->compression = false; + config->index_memory = UDS_MEMORY_CONFIG_256MB; + config->index_sparse = false; + config->slab_blocks = DEFAULT_VDO_SLAB_BLOCKS; arg_set.argc = argc; arg_set.argv = argv; @@ -783,7 +879,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, /* Get the physical blocks, if known. */ if (config->version >= 1) { result = kstrtoull(dm_shift_arg(&arg_set), 10, &config->physical_blocks); - if (result != VDO_SUCCESS) { + if (result) { handle_parse_error(config, error_ptr, "Invalid physical block count"); return VDO_BAD_CONFIGURATION; @@ -804,7 +900,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, /* Get the page cache size. 
*/ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->cache_size); - if (result != VDO_SUCCESS) { + if (result) { handle_parse_error(config, error_ptr, "Invalid block map page cache size"); return VDO_BAD_CONFIGURATION; @@ -812,7 +908,7 @@ static int parse_device_config(int argc, char **argv, struct dm_target *ti, /* Get the block map era length. */ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->block_map_maximum_age); - if (result != VDO_SUCCESS) { + if (result) { handle_parse_error(config, error_ptr, "Invalid block map maximum age"); return VDO_BAD_CONFIGURATION; } @@ -1401,7 +1497,33 @@ static void pre_load_callback(struct vdo_completion *completion) vdo_continue_completion(completion, result); return; } + if (vdo->needs_formatting) + vdo->admin.phase = PRE_LOAD_PHASE_FORMAT_START; + else + vdo->admin.phase = PRE_LOAD_PHASE_LOAD_SUPER; + + vdo_continue_completion(completion, VDO_SUCCESS); + return; + + case PRE_LOAD_PHASE_FORMAT_START: + vdo_continue_completion(completion, vdo_clear_layout(vdo)); + return; + + case PRE_LOAD_PHASE_FORMAT_SUPER: + vdo_save_super_block(vdo, completion); + return; + + case PRE_LOAD_PHASE_FORMAT_GEOMETRY: + vdo_save_geometry_block(vdo, completion); + return; + + case PRE_LOAD_PHASE_FORMAT_END: + /* cleanup layout before load adds to it */ + vdo_uninitialize_layout(&vdo->states.layout); + vdo_continue_completion(completion, VDO_SUCCESS); + return; + case PRE_LOAD_PHASE_LOAD_SUPER: vdo_load_super_block(vdo, completion); return; @@ -1459,10 +1581,13 @@ static int vdo_initialize(struct dm_target *ti, unsigned int instance, vdo_log_debug("Logical blocks = %llu", logical_blocks); vdo_log_debug("Physical block size = %llu", (u64) block_size); vdo_log_debug("Physical blocks = %llu", config->physical_blocks); + vdo_log_debug("Slab size = %llu", config->slab_blocks); vdo_log_debug("Block map cache blocks = %u", config->cache_size); vdo_log_debug("Block map maximum age = %u", config->block_map_maximum_age); 
vdo_log_debug("Deduplication = %s", (config->deduplication ? "on" : "off")); vdo_log_debug("Compression = %s", (config->compression ? "on" : "off")); + vdo_log_debug("Index memory = %u", config->index_memory); + vdo_log_debug("Index sparse = %s", (config->index_sparse ? "on" : "off")); vdo = vdo_find_matching(vdo_uses_device, config); if (vdo != NULL) { @@ -2858,7 +2983,7 @@ static void vdo_resume(struct dm_target *ti) static struct target_type vdo_target_bio = { .features = DM_TARGET_SINGLETON, .name = "vdo", - .version = { 9, 1, 0 }, + .version = { 9, 2, 0 }, .module = THIS_MODULE, .ctr = vdo_ctr, .dtr = vdo_dtr, diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index bd60f4b3a0d0..d75e023df637 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -12,15 +12,10 @@ #include "permassert.h" #include "constants.h" +#include "indexer.h" #include "status-codes.h" #include "types.h" -/** The maximum logical space is 4 petabytes, which is 1 terablock. */ -static const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024; - -/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ -static const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64; - struct geometry_block { char magic_number[VDO_GEOMETRY_MAGIC_NUMBER_SIZE]; struct packed_header header; @@ -293,6 +288,62 @@ static void decode_volume_geometry(u8 *buffer, size_t *offset, } /** + * vdo_encode_volume_geometry() - Encode the on-disk representation of a volume geometry into a buffer. + * @buffer: A buffer to store the encoding. + * @geometry: The geometry to encode. + * @version: The geometry block version to encode. + * + * Return: VDO_SUCCESS or an error. 
+ */ +int vdo_encode_volume_geometry(u8 *buffer, const struct volume_geometry *geometry, + u32 version) +{ + int result; + enum volume_region_id id; + u32 checksum; + size_t offset = 0; + const struct header *header; + + memcpy(buffer, VDO_GEOMETRY_MAGIC_NUMBER, VDO_GEOMETRY_MAGIC_NUMBER_SIZE); + offset += VDO_GEOMETRY_MAGIC_NUMBER_SIZE; + + header = (version > 4) ? &GEOMETRY_BLOCK_HEADER_5_0 : &GEOMETRY_BLOCK_HEADER_4_0; + vdo_encode_header(buffer, &offset, header); + + /* This is for backwards compatibility */ + encode_u32_le(buffer, &offset, geometry->unused); + encode_u64_le(buffer, &offset, geometry->nonce); + memcpy(buffer + offset, (unsigned char *) &geometry->uuid, sizeof(uuid_t)); + offset += sizeof(uuid_t); + + if (version > 4) + encode_u64_le(buffer, &offset, geometry->bio_offset); + + for (id = 0; id < VDO_VOLUME_REGION_COUNT; id++) { + encode_u32_le(buffer, &offset, geometry->regions[id].id); + encode_u64_le(buffer, &offset, geometry->regions[id].start_block); + } + + encode_u32_le(buffer, &offset, geometry->index_config.mem); + encode_u32_le(buffer, &offset, 0); + + if (geometry->index_config.sparse) + buffer[offset++] = 1; + else + buffer[offset++] = 0; + + result = VDO_ASSERT(header->size == offset + sizeof(u32), + "should have encoded up to the geometry checksum"); + if (result != VDO_SUCCESS) + return result; + + checksum = vdo_crc32(buffer, offset); + encode_u32_le(buffer, &offset, checksum); + + return VDO_SUCCESS; +} + +/** * vdo_parse_geometry_block() - Decode and validate an encoded geometry block. * @block: The encoded geometry block. * @geometry: The structure to receive the decoded fields. 
@@ -798,7 +849,7 @@ static int allocate_partition(struct layout *layout, u8 id, struct partition *partition; int result; - result = vdo_allocate(1, struct partition, __func__, &partition); + result = vdo_allocate(1, __func__, &partition); if (result != VDO_SUCCESS) return result; @@ -1219,9 +1270,9 @@ int vdo_validate_config(const struct vdo_config *config, if (result != VDO_SUCCESS) return result; - result = VDO_ASSERT(config->slab_size <= (1 << MAX_VDO_SLAB_BITS), - "slab size must be less than or equal to 2^%d", - MAX_VDO_SLAB_BITS); + result = VDO_ASSERT(config->slab_size <= MAX_VDO_SLAB_BLOCKS, + "slab size must be a power of two less than or equal to %d", + MAX_VDO_SLAB_BLOCKS); if (result != VDO_SUCCESS) return result; @@ -1486,3 +1537,153 @@ int vdo_decode_super_block(u8 *buffer) return ((checksum != saved_checksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS); } + +/** + * vdo_initialize_component_states() - Initialize the components so they can be written out. + * @vdo_config: The config used for component state initialization. + * @geometry: The volume geometry used to calculate the data region offset. + * @nonce: The nonce to use to identify the vdo. + * @states: The component states to initialize. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_initialize_component_states(const struct vdo_config *vdo_config, + const struct volume_geometry *geometry, + nonce_t nonce, + struct vdo_component_states *states) +{ + int result; + struct slab_config slab_config; + struct partition *partition; + + states->vdo.config = *vdo_config; + states->vdo.nonce = nonce; + states->volume_version = VDO_VOLUME_VERSION_67_0; + + states->recovery_journal = (struct recovery_journal_state_7_0) { + .journal_start = RECOVERY_JOURNAL_STARTING_SEQUENCE_NUMBER, + .logical_blocks_used = 0, + .block_map_data_blocks = 0, + }; + + /* + * The layout starts 1 block past the beginning of the data region, as the + * data region contains the super block but the layout does not. 
+ */ + result = vdo_initialize_layout(vdo_config->physical_blocks, + vdo_get_data_region_start(*geometry) + 1, + DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT, + vdo_config->recovery_journal_size, + VDO_SLAB_SUMMARY_BLOCKS, + &states->layout); + if (result != VDO_SUCCESS) + return result; + + result = vdo_configure_slab(vdo_config->slab_size, + vdo_config->slab_journal_blocks, + &slab_config); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + result = vdo_get_partition(&states->layout, VDO_SLAB_DEPOT_PARTITION, + &partition); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + result = vdo_configure_slab_depot(partition, slab_config, 0, + &states->slab_depot); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + result = vdo_get_partition(&states->layout, VDO_BLOCK_MAP_PARTITION, + &partition); + if (result != VDO_SUCCESS) { + vdo_uninitialize_layout(&states->layout); + return result; + } + + states->block_map = (struct block_map_state_2_0) { + .flat_page_origin = VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN, + .flat_page_count = 0, + .root_origin = partition->offset, + .root_count = DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT, + }; + + states->vdo.state = VDO_NEW; + + return VDO_SUCCESS; +} + +/** + * vdo_compute_index_blocks() - Compute the number of blocks that the indexer will use. + * @config: The index config from which the blocks are calculated. + * @index_blocks_ptr: The number of blocks the index will use. + * + * Return: VDO_SUCCESS or an error code. 
+ */ +static int vdo_compute_index_blocks(const struct index_config *config, + block_count_t *index_blocks_ptr) +{ + int result; + u64 index_bytes; + struct uds_parameters uds_parameters = { + .memory_size = config->mem, + .sparse = config->sparse, + }; + + result = uds_compute_index_size(&uds_parameters, &index_bytes); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "error computing index size"); + + *index_blocks_ptr = index_bytes / VDO_BLOCK_SIZE; + return VDO_SUCCESS; +} + +/** + * vdo_initialize_volume_geometry() - Initialize the volume geometry so it can be written out. + * @nonce: The nonce to use to identify the vdo. + * @uuid: The uuid to use to identify the vdo. + * @index_config: The config used for structure initialization. + * @geometry: The volume geometry to initialize. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_initialize_volume_geometry(nonce_t nonce, uuid_t *uuid, + const struct index_config *index_config, + struct volume_geometry *geometry) +{ + int result; + block_count_t index_blocks = 0; + + result = vdo_compute_index_blocks(index_config, &index_blocks); + if (result != VDO_SUCCESS) + return result; + + *geometry = (struct volume_geometry) { + /* This is for backwards compatibility. 
*/ + .unused = 0, + .nonce = nonce, + .bio_offset = 0, + .regions = { + [VDO_INDEX_REGION] = { + .id = VDO_INDEX_REGION, + .start_block = 1, + }, + [VDO_DATA_REGION] = { + .id = VDO_DATA_REGION, + .start_block = 1 + index_blocks, + } + } + }; + + memcpy(&(geometry->uuid), uuid, sizeof(uuid_t)); + memcpy(&geometry->index_config, index_config, sizeof(struct index_config)); + + return VDO_SUCCESS; +} diff --git a/drivers/md/dm-vdo/encodings.h b/drivers/md/dm-vdo/encodings.h index 87b7d2f3b545..67ff0ff2ffda 100644 --- a/drivers/md/dm-vdo/encodings.h +++ b/drivers/md/dm-vdo/encodings.h @@ -608,6 +608,12 @@ struct vdo_config { block_count_t slab_journal_blocks; /* number of slab journal blocks */ }; +/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +#define MAXIMUM_VDO_LOGICAL_BLOCKS ((block_count_t)(1024ULL * 1024 * 1024 * 1024)) + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ +#define MAXIMUM_VDO_PHYSICAL_BLOCKS ((block_count_t)(1024ULL * 1024 * 1024 * 64)) + /* This is the structure that captures the vdo fields saved as a super block component. 
*/ struct vdo_component { enum vdo_state state; @@ -803,6 +809,12 @@ vdo_get_index_region_size(struct volume_geometry geometry) vdo_get_index_region_start(geometry); } +int vdo_initialize_volume_geometry(nonce_t nonce, uuid_t *uuid, + const struct index_config *index_config, + struct volume_geometry *geometry); + +int vdo_encode_volume_geometry(u8 *buffer, const struct volume_geometry *geometry, + u32 version); int __must_check vdo_parse_geometry_block(unsigned char *block, struct volume_geometry *geometry); @@ -1264,6 +1276,11 @@ int __must_check vdo_validate_component_states(struct vdo_component_states *stat void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states); int __must_check vdo_decode_super_block(u8 *buffer); +int vdo_initialize_component_states(const struct vdo_config *vdo_config, + const struct volume_geometry *geometry, + nonce_t nonce, + struct vdo_component_states *states); + /* We start with 0L and postcondition with ~0L to match our historical usage in userspace. 
*/ static inline u32 vdo_crc32(const void *buf, unsigned long len) { diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index 82a259ef1601..6c1610ba91b6 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -105,7 +105,7 @@ static void *allocate_flush(gfp_t gfp_mask, void *pool_data) if ((gfp_mask & GFP_NOWAIT) == GFP_NOWAIT) { flush = vdo_allocate_memory_nowait(sizeof(struct vdo_flush), __func__); } else { - int result = vdo_allocate(1, struct vdo_flush, __func__, &flush); + int result = vdo_allocate(1, __func__, &flush); if (result != VDO_SUCCESS) vdo_log_error_strerror(result, "failed to allocate spare flush"); @@ -134,7 +134,7 @@ static void free_flush(void *element, void *pool_data __always_unused) */ int vdo_make_flusher(struct vdo *vdo) { - int result = vdo_allocate(1, struct flusher, __func__, &vdo->flusher); + int result = vdo_allocate(1, __func__, &vdo->flusher); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/funnel-queue.c b/drivers/md/dm-vdo/funnel-queue.c index a63b2f2bfd7d..7011963c9073 100644 --- a/drivers/md/dm-vdo/funnel-queue.c +++ b/drivers/md/dm-vdo/funnel-queue.c @@ -14,7 +14,7 @@ int vdo_make_funnel_queue(struct funnel_queue **queue_ptr) int result; struct funnel_queue *queue; - result = vdo_allocate(1, struct funnel_queue, "funnel queue", &queue); + result = vdo_allocate(1, "funnel queue", &queue); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index 8a79b33b8b09..62d300f70de9 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -322,7 +322,7 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na "queue priority count %u within limit %u", type->max_priority, VDO_WORK_Q_MAX_PRIORITY); - result = vdo_allocate(1, struct simple_work_queue, "simple work queue", &queue); + result = vdo_allocate(1, "simple work queue", &queue); if 
(result != VDO_SUCCESS) return result; @@ -405,13 +405,11 @@ int vdo_make_work_queue(const char *thread_name_prefix, const char *name, return result; } - result = vdo_allocate(1, struct round_robin_work_queue, "round-robin work queue", - &queue); + result = vdo_allocate(1, "round-robin work queue", &queue); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(thread_count, struct simple_work_queue *, - "subordinate work queues", &queue->service_queues); + result = vdo_allocate(thread_count, "subordinate work queues", &queue->service_queues); if (result != VDO_SUCCESS) { vdo_free(queue); return result; diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c index fb1db41c794b..bb3b0ab5d50d 100644 --- a/drivers/md/dm-vdo/indexer/chapter-index.c +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -20,7 +20,7 @@ int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, size_t memory_size; struct open_chapter_index *index; - result = vdo_allocate(1, struct open_chapter_index, "open chapter index", &index); + result = vdo_allocate(1, "open chapter index", &index); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c index 5532371b952f..4a2cc66cfd60 100644 --- a/drivers/md/dm-vdo/indexer/config.c +++ b/drivers/md/dm-vdo/indexer/config.c @@ -325,7 +325,7 @@ int uds_make_configuration(const struct uds_parameters *params, if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct uds_configuration, __func__, &config); + result = vdo_allocate(1, __func__, &config); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c index 0ac2443f0df3..b288749067de 100644 --- a/drivers/md/dm-vdo/indexer/delta-index.c +++ b/drivers/md/dm-vdo/indexer/delta-index.c @@ -311,18 +311,16 @@ static int initialize_delta_zone(struct delta_zone *delta_zone, 
size_t size, { int result; - result = vdo_allocate(size, u8, "delta list", &delta_zone->memory); + result = vdo_allocate(size, "delta list", &delta_zone->memory); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(list_count + 2, u64, "delta list temp", - &delta_zone->new_offsets); + result = vdo_allocate(list_count + 2, "delta list temp", &delta_zone->new_offsets); if (result != VDO_SUCCESS) return result; /* Allocate the delta lists. */ - result = vdo_allocate(list_count + 2, struct delta_list, "delta lists", - &delta_zone->delta_lists); + result = vdo_allocate(list_count + 2, "delta lists", &delta_zone->delta_lists); if (result != VDO_SUCCESS) return result; @@ -352,8 +350,7 @@ int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zon unsigned int z; size_t zone_memory; - result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones", - &delta_index->delta_zones); + result = vdo_allocate(zone_count, "Delta Index Zones", &delta_index->delta_zones); if (result != VDO_SUCCESS) return result; @@ -1047,7 +1044,7 @@ int uds_finish_restoring_delta_index(struct delta_index *delta_index, unsigned int z; u8 *data; - result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data); + result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, __func__, &data); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c index 1a5735375ddc..03797cf87b91 100644 --- a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c +++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c @@ -198,7 +198,7 @@ int uds_make_request_queue(const char *queue_name, int result; struct uds_request_queue *queue; - result = vdo_allocate(1, struct uds_request_queue, __func__, &queue); + result = vdo_allocate(1, __func__, &queue); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c index 
c0575612e820..49f122a223d5 100644 --- a/drivers/md/dm-vdo/indexer/geometry.c +++ b/drivers/md/dm-vdo/indexer/geometry.c @@ -61,7 +61,7 @@ int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, int result; struct index_geometry *geometry; - result = vdo_allocate(1, struct index_geometry, "geometry", &geometry); + result = vdo_allocate(1, "geometry", &geometry); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index 61edf2b72427..5f4ce4ab1b1e 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -249,6 +249,32 @@ static int __must_check compute_sizes(const struct uds_configuration *config, return UDS_SUCCESS; } +int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size) +{ + int result; + struct uds_configuration *index_config; + struct save_layout_sizes sizes; + + if (index_size == NULL) { + vdo_log_error("Missing output size pointer"); + return -EINVAL; + } + + result = uds_make_configuration(parameters, &index_config); + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, "cannot compute index size"); + return result; + } + + result = compute_sizes(index_config, &sizes); + uds_free_configuration(index_config); + if (result != UDS_SUCCESS) + return result; + + *index_size = sizes.total_size; + return UDS_SUCCESS; +} + /* Create unique data using the current time and a pseudorandom number. 
*/ static void create_unique_nonce_data(u8 *buffer) { @@ -459,8 +485,7 @@ static int __must_check make_index_save_region_table(struct index_save_layout *i type = RH_TYPE_UNSAVED; } - result = vdo_allocate_extended(struct region_table, region_count, - struct layout_region, + result = vdo_allocate_extended(region_count, regions, "layout region table for ISL", &table); if (result != VDO_SUCCESS) return result; @@ -520,7 +545,7 @@ static int __must_check write_index_save_header(struct index_save_layout *isl, u8 *buffer; size_t offset = 0; - result = vdo_allocate(table->encoded_size, u8, "index save data", &buffer); + result = vdo_allocate(table->encoded_size, "index save data", &buffer); if (result != VDO_SUCCESS) return result; @@ -642,9 +667,8 @@ static int __must_check make_layout_region_table(struct index_layout *layout, struct region_table *table; struct layout_region *lr; - result = vdo_allocate_extended(struct region_table, region_count, - struct layout_region, "layout region table", - &table); + result = vdo_allocate_extended(region_count, regions, + "layout region table", &table); if (result != VDO_SUCCESS) return result; @@ -690,7 +714,7 @@ static int __must_check write_layout_header(struct index_layout *layout, u8 *buffer; size_t offset = 0; - result = vdo_allocate(table->encoded_size, u8, "layout data", &buffer); + result = vdo_allocate(table->encoded_size, "layout data", &buffer); if (result != VDO_SUCCESS) return result; @@ -780,8 +804,7 @@ static int create_index_layout(struct index_layout *layout, struct uds_configura if (result != UDS_SUCCESS) return result; - result = vdo_allocate(sizes.save_count, struct index_save_layout, __func__, - &layout->index.saves); + result = vdo_allocate(sizes.save_count, __func__, &layout->index.saves); if (result != VDO_SUCCESS) return result; @@ -1138,8 +1161,7 @@ static int __must_check load_region_table(struct buffered_reader *reader, header.version); } - result = vdo_allocate_extended(struct region_table, 
header.region_count, - struct layout_region, + result = vdo_allocate_extended(header.region_count, regions, "single file layout region table", &table); if (result != VDO_SUCCESS) return result; @@ -1177,7 +1199,7 @@ static int __must_check read_super_block_data(struct buffered_reader *reader, u8 *buffer; size_t offset = 0; - result = vdo_allocate(saved_size, u8, "super block data", &buffer); + result = vdo_allocate(saved_size, "super block data", &buffer); if (result != VDO_SUCCESS) return result; @@ -1311,8 +1333,7 @@ static int __must_check reconstitute_layout(struct index_layout *layout, int result; u64 next_block = first_block; - result = vdo_allocate(layout->super.max_saves, struct index_save_layout, - __func__, &layout->index.saves); + result = vdo_allocate(layout->super.max_saves, __func__, &layout->index.saves); if (result != VDO_SUCCESS) return result; @@ -1445,6 +1466,9 @@ static int __must_check reconstruct_index_save(struct index_save_layout *isl, u64 last_block = next_block + isl->index_save.block_count; isl->zone_count = table->header.region_count - 3; + if (isl->zone_count > MAX_ZONES) + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "invalid zone count"); last_region = &table->regions[table->header.region_count - 1]; if (last_region->kind == RL_KIND_EMPTY) { @@ -1672,7 +1696,7 @@ int uds_make_index_layout(struct uds_configuration *config, bool new_layout, if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct index_layout, __func__, &layout); + result = vdo_allocate(1, __func__, &layout); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c index 00b44e07d0c1..1d45d466d07f 100644 --- a/drivers/md/dm-vdo/indexer/index-page-map.c +++ b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -38,13 +38,13 @@ int uds_make_index_page_map(const struct index_geometry *geometry, int result; struct index_page_map *map; - result = vdo_allocate(1, struct 
index_page_map, "page map", &map); + result = vdo_allocate(1, "page map", &map); if (result != VDO_SUCCESS) return result; map->geometry = geometry; map->entries_per_chapter = geometry->index_pages_per_chapter - 1; - result = vdo_allocate(get_entry_count(geometry), u16, "Index Page Map Entries", + result = vdo_allocate(get_entry_count(geometry), "Index Page Map Entries", &map->entries); if (result != VDO_SUCCESS) { uds_free_index_page_map(map); @@ -118,7 +118,7 @@ int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); u32 i; - result = vdo_allocate(saved_size, u8, "page map data", &buffer); + result = vdo_allocate(saved_size, "page map data", &buffer); if (result != VDO_SUCCESS) return result; @@ -145,7 +145,7 @@ int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader * u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); u32 i; - result = vdo_allocate(saved_size, u8, "page map data", &buffer); + result = vdo_allocate(saved_size, "page map data", &buffer); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index aa575a24e0b2..6c78070e1a05 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -217,7 +217,7 @@ static int __must_check make_empty_index_session(struct uds_index_session **inde int result; struct uds_index_session *session; - result = vdo_allocate(1, struct uds_index_session, __func__, &session); + result = vdo_allocate(1, __func__, &session); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c index df4934846244..793bd32c1179 100644 --- a/drivers/md/dm-vdo/indexer/index.c +++ b/drivers/md/dm-vdo/indexer/index.c @@ -88,7 +88,7 @@ static int launch_zone_message(struct uds_zone_message message, unsigned int zon 
int result; struct uds_request *request; - result = vdo_allocate(1, struct uds_request, __func__, &request); + result = vdo_allocate(1, __func__, &request); if (result != VDO_SUCCESS) return result; @@ -764,9 +764,7 @@ static int make_chapter_writer(struct uds_index *index, size_t collated_records_size = (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter); - result = vdo_allocate_extended(struct chapter_writer, index->zone_count, - struct open_chapter_zone *, "Chapter Writer", - &writer); + result = vdo_allocate_extended(index->zone_count, chapters, "Chapter Writer", &writer); if (result != VDO_SUCCESS) return result; @@ -1123,7 +1121,7 @@ static int make_index_zone(struct uds_index *index, unsigned int zone_number) int result; struct index_zone *zone; - result = vdo_allocate(1, struct index_zone, "index zone", &zone); + result = vdo_allocate(1, "index zone", &zone); if (result != VDO_SUCCESS) return result; @@ -1160,8 +1158,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op u64 nonce; unsigned int z; - result = vdo_allocate_extended(struct uds_index, config->zone_count, - struct uds_request_queue *, "index", &index); + result = vdo_allocate_extended(config->zone_count, zone_queues, "index", &index); if (result != VDO_SUCCESS) return result; @@ -1173,8 +1170,7 @@ int uds_make_index(struct uds_configuration *config, enum uds_open_index_type op return result; } - result = vdo_allocate(index->zone_count, struct index_zone *, "zones", - &index->zones); + result = vdo_allocate(index->zone_count, "zones", &index->zones); if (result != VDO_SUCCESS) { uds_free_index(index); return result; diff --git a/drivers/md/dm-vdo/indexer/index.h b/drivers/md/dm-vdo/indexer/index.h index edabb239548e..1891f2de508e 100644 --- a/drivers/md/dm-vdo/indexer/index.h +++ b/drivers/md/dm-vdo/indexer/index.h @@ -53,7 +53,7 @@ struct uds_index { index_callback_fn callback; struct uds_request_queue *triage_queue; - struct 
uds_request_queue *zone_queues[]; + struct uds_request_queue *zone_queues[] __counted_by(zone_count); }; enum request_stage { diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h index 7c1fc4577f5b..d765f24328eb 100644 --- a/drivers/md/dm-vdo/indexer/indexer.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -282,6 +282,10 @@ struct uds_request { ); }; +/* Compute the number of bytes needed to store an index. */ +int __must_check uds_compute_index_size(const struct uds_parameters *parameters, + u64 *index_size); + /* A session is required for most index operations. */ int __must_check uds_create_index_session(struct uds_index_session **session); diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c index 1bee9d63dc0a..f42861372030 100644 --- a/drivers/md/dm-vdo/indexer/io-factory.c +++ b/drivers/md/dm-vdo/indexer/io-factory.c @@ -64,7 +64,7 @@ int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_p int result; struct io_factory *factory; - result = vdo_allocate(1, struct io_factory, __func__, &factory); + result = vdo_allocate(1, __func__, &factory); if (result != VDO_SUCCESS) return result; @@ -144,7 +144,7 @@ int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct buffered_reader, "buffered reader", &reader); + result = vdo_allocate(1, "buffered reader", &reader); if (result != VDO_SUCCESS) { dm_bufio_client_destroy(client); return result; @@ -282,7 +282,7 @@ int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block if (result != UDS_SUCCESS) return result; - result = vdo_allocate(1, struct buffered_writer, "buffered writer", &writer); + result = vdo_allocate(1, "buffered writer", &writer); if (result != VDO_SUCCESS) { dm_bufio_client_destroy(client); return result; diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c 
b/drivers/md/dm-vdo/indexer/open-chapter.c index 4a67bcadaae0..89b91c600bfd 100644 --- a/drivers/md/dm-vdo/indexer/open-chapter.c +++ b/drivers/md/dm-vdo/indexer/open-chapter.c @@ -68,9 +68,7 @@ int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zo size_t capacity = geometry->records_per_chapter / zone_count; size_t slot_count = (1 << bits_per(capacity * LOAD_RATIO)); - result = vdo_allocate_extended(struct open_chapter_zone, slot_count, - struct open_chapter_zone_slot, "open chapter", - &open_chapter); + result = vdo_allocate_extended(slot_count, slots, "open chapter", &open_chapter); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/open-chapter.h b/drivers/md/dm-vdo/indexer/open-chapter.h index a4250bb19525..ea6d7336aea0 100644 --- a/drivers/md/dm-vdo/indexer/open-chapter.h +++ b/drivers/md/dm-vdo/indexer/open-chapter.h @@ -40,7 +40,7 @@ struct open_chapter_zone { /* The number of slots in the hash table */ unsigned int slot_count; /* The hash table slots, referencing virtual record numbers */ - struct open_chapter_zone_slot slots[]; + struct open_chapter_zone_slot slots[] __counted_by(slot_count); }; int __must_check uds_make_open_chapter(const struct index_geometry *geometry, diff --git a/drivers/md/dm-vdo/indexer/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c index 66b8c706a1ef..4b81e130d18a 100644 --- a/drivers/md/dm-vdo/indexer/radix-sort.c +++ b/drivers/md/dm-vdo/indexer/radix-sort.c @@ -211,8 +211,7 @@ int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter) unsigned int stack_size = count / INSERTION_SORT_THRESHOLD; struct radix_sorter *radix_sorter; - result = vdo_allocate_extended(struct radix_sorter, stack_size, struct task, - __func__, &radix_sorter); + result = vdo_allocate_extended(stack_size, stack, __func__, &radix_sorter); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c 
index 28920167827c..eb62d3f01834 100644 --- a/drivers/md/dm-vdo/indexer/sparse-cache.c +++ b/drivers/md/dm-vdo/indexer/sparse-cache.c @@ -222,13 +222,12 @@ static int __must_check initialize_cached_chapter_index(struct cached_chapter_in chapter->virtual_chapter = NO_CHAPTER; chapter->index_pages_count = geometry->index_pages_per_chapter; - result = vdo_allocate(chapter->index_pages_count, struct delta_index_page, - __func__, &chapter->index_pages); + result = vdo_allocate(chapter->index_pages_count, __func__, &chapter->index_pages); if (result != VDO_SUCCESS) return result; - return vdo_allocate(chapter->index_pages_count, struct dm_buffer *, - "sparse index volume pages", &chapter->page_buffers); + return vdo_allocate(chapter->index_pages_count, "sparse index volume pages", + &chapter->page_buffers); } static int __must_check make_search_list(struct sparse_cache *cache, @@ -294,8 +293,7 @@ int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int ca } /* purge_search_list() needs some temporary lists for sorting. */ - result = vdo_allocate(capacity * 2, struct cached_chapter_index *, - "scratch entries", &cache->scratch_entries); + result = vdo_allocate(capacity * 2, "scratch entries", &cache->scratch_entries); if (result != VDO_SUCCESS) goto out; diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index afb062e1f1fb..e78d2725ce8b 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -1211,13 +1211,12 @@ static int initialize_volume_sub_index(const struct uds_configuration *config, (zone_count * sizeof(struct volume_sub_index_zone))); /* The following arrays are initialized to all zeros. 
*/ - result = vdo_allocate(params.list_count, u64, "first chapter to flush", + result = vdo_allocate(params.list_count, "first chapter to flush", &sub_index->flush_chapters); if (result != VDO_SUCCESS) return result; - return vdo_allocate(zone_count, struct volume_sub_index_zone, - "volume index zones", &sub_index->zones); + return vdo_allocate(zone_count, "volume index zones", &sub_index->zones); } int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce, @@ -1228,7 +1227,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non struct volume_index *volume_index; int result; - result = vdo_allocate(1, struct volume_index, "volume index", &volume_index); + result = vdo_allocate(1, "volume index", &volume_index); if (result != VDO_SUCCESS) return result; @@ -1249,8 +1248,7 @@ int uds_make_volume_index(const struct uds_configuration *config, u64 volume_non volume_index->sparse_sample_rate = config->sparse_sample_rate; - result = vdo_allocate(config->zone_count, struct volume_index_zone, - "volume index zones", &volume_index->zones); + result = vdo_allocate(config->zone_count, "volume index zones", &volume_index->zones); if (result != VDO_SUCCESS) { uds_free_volume_index(volume_index); return result; diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index 425b3a74f4db..af97c0cbeede 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -1509,23 +1509,21 @@ static int __must_check initialize_page_cache(struct page_cache *cache, if (result != VDO_SUCCESS) return result; - result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, - "volume read queue", &cache->read_queue); + result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, "volume read queue", + &cache->read_queue); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(cache->zone_count, struct search_pending_counter, - "Volume Cache Zones", 
&cache->search_pending_counters); + result = vdo_allocate(cache->zone_count, "Volume Cache Zones", + &cache->search_pending_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(cache->indexable_pages, u16, "page cache index", - &cache->index); + result = vdo_allocate(cache->indexable_pages, "page cache index", &cache->index); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(cache->cache_slots, struct cached_page, "page cache cache", - &cache->cache); + result = vdo_allocate(cache->cache_slots, "page cache cache", &cache->cache); if (result != VDO_SUCCESS) return result; @@ -1548,7 +1546,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout unsigned int reserved_buffers; int result; - result = vdo_allocate(1, struct volume, "volume", &volume); + result = vdo_allocate(1, "volume", &volume); if (result != VDO_SUCCESS) return result; @@ -1585,8 +1583,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout return result; } - result = vdo_allocate(geometry->records_per_page, - const struct uds_volume_record *, "record pointers", + result = vdo_allocate(geometry->records_per_page, "record pointers", &volume->record_pointers); if (result != VDO_SUCCESS) { uds_free_volume(volume); @@ -1626,8 +1623,7 @@ int uds_make_volume(const struct uds_configuration *config, struct index_layout uds_init_cond(&volume->read_threads_read_done_cond); uds_init_cond(&volume->read_threads_cond); - result = vdo_allocate(config->read_threads, struct thread *, "reader threads", - &volume->reader_threads); + result = vdo_allocate(config->read_threads, "reader threads", &volume->reader_threads); if (result != VDO_SUCCESS) { uds_free_volume(volume); return result; diff --git a/drivers/md/dm-vdo/int-map.c b/drivers/md/dm-vdo/int-map.c index aeb690415dbd..28d8af1f9be2 100644 --- a/drivers/md/dm-vdo/int-map.c +++ b/drivers/md/dm-vdo/int-map.c @@ -164,8 +164,7 @@ static int allocate_buckets(struct int_map 
*map, size_t capacity) * without have to wrap back around to element zero. */ map->bucket_count = capacity + (NEIGHBORHOOD - 1); - return vdo_allocate(map->bucket_count, struct bucket, - "struct int_map buckets", &map->buckets); + return vdo_allocate(map->bucket_count, "struct int_map buckets", &map->buckets); } /** @@ -182,7 +181,7 @@ int vdo_int_map_create(size_t initial_capacity, struct int_map **map_ptr) int result; size_t capacity; - result = vdo_allocate(1, struct int_map, "struct int_map", &map); + result = vdo_allocate(1, "struct int_map", &map); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index e26d75f8366d..0916c8609543 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -365,6 +365,33 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, } /** + * vdo_submit_metadata_vio_wait() - Submit I/O for a metadata vio and wait for completion. + * @vio: the vio for which to issue I/O + * @physical: the physical block number to read or write + * @operation: the type of I/O to perform + * + * The function operates similarly to __submit_metadata_vio except that it will + * block until the work is done. It can be used to do i/o before work queues + * and thread completions are set up. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_submit_metadata_vio_wait(struct vio *vio, + physical_block_number_t physical, + blk_opf_t operation) +{ + int result; + + result = vio_reset_bio(vio, vio->data, NULL, operation | REQ_META, physical); + if (result != VDO_SUCCESS) + return result; + + bio_set_dev(vio->bio, vdo_get_backing_device(vio->completion.vdo)); + submit_bio_wait(vio->bio); + return blk_status_to_errno(vio->bio->bi_status); +} + +/** * vdo_make_io_submitter() - Create an io_submitter structure. * @thread_count: Number of bio-submission threads to set up. 
* @rotation_interval: Interval to use when rotating between bio-submission threads when enqueuing @@ -383,8 +410,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter struct io_submitter *io_submitter; int result; - result = vdo_allocate_extended(struct io_submitter, thread_count, - struct bio_queue_data, "bio submission data", + result = vdo_allocate_extended(thread_count, bio_queue_data, "bio submission data", &io_submitter); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h index 3088f11055fd..0f320a60e9e8 100644 --- a/drivers/md/dm-vdo/io-submitter.h +++ b/drivers/md/dm-vdo/io-submitter.h @@ -56,4 +56,8 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); } +int vdo_submit_metadata_vio_wait(struct vio *vio, + physical_block_number_t physical, + blk_opf_t operation); + #endif /* VDO_IO_SUBMITTER_H */ diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c index 0a27e60a9dfd..fa7c3eb7ee6b 100644 --- a/drivers/md/dm-vdo/logical-zone.c +++ b/drivers/md/dm-vdo/logical-zone.c @@ -94,8 +94,7 @@ int vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = vdo_allocate_extended(struct logical_zones, zone_count, - struct logical_zone, __func__, &zones); + result = vdo_allocate_extended(zone_count, zones, __func__, &zones); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/logical-zone.h b/drivers/md/dm-vdo/logical-zone.h index 1b666c84a193..a36a864c6836 100644 --- a/drivers/md/dm-vdo/logical-zone.h +++ b/drivers/md/dm-vdo/logical-zone.h @@ -60,7 +60,7 @@ struct logical_zones { /* The number of zones */ zone_count_t zone_count; /* The logical zones themselves */ - struct logical_zone zones[]; + struct logical_zone zones[] __counted_by(zone_count); }; int __must_check 
vdo_make_logical_zones(struct vdo *vdo, diff --git a/drivers/md/dm-vdo/memory-alloc.c b/drivers/md/dm-vdo/memory-alloc.c index 185f259c7245..a7f07522110d 100644 --- a/drivers/md/dm-vdo/memory-alloc.c +++ b/drivers/md/dm-vdo/memory-alloc.c @@ -245,7 +245,7 @@ int vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr) } else { struct vmalloc_block_info *block; - if (vdo_allocate(1, struct vmalloc_block_info, __func__, &block) == VDO_SUCCESS) { + if (vdo_allocate(1, __func__, &block) == VDO_SUCCESS) { /* * It is possible for __vmalloc to fail to allocate memory because there * are no pages available. A short sleep may allow the page reclaimer @@ -341,6 +341,7 @@ int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *w void *new_ptr) { int result; + char *temp_ptr; if (size == 0) { vdo_free(ptr); @@ -348,9 +349,10 @@ int vdo_reallocate_memory(void *ptr, size_t old_size, size_t size, const char *w return VDO_SUCCESS; } - result = vdo_allocate(size, char, what, new_ptr); + result = vdo_allocate(size, what, &temp_ptr); if (result != VDO_SUCCESS) return result; + *(void **) new_ptr = temp_ptr; if (ptr != NULL) { if (old_size < size) @@ -368,7 +370,7 @@ int vdo_duplicate_string(const char *string, const char *what, char **new_string int result; u8 *dup; - result = vdo_allocate(strlen(string) + 1, u8, what, &dup); + result = vdo_allocate(strlen(string) + 1, what, &dup); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/memory-alloc.h b/drivers/md/dm-vdo/memory-alloc.h index 0093d9f940d9..bc5527327ed8 100644 --- a/drivers/md/dm-vdo/memory-alloc.h +++ b/drivers/md/dm-vdo/memory-alloc.h @@ -8,6 +8,7 @@ #include <linux/cache.h> #include <linux/io.h> /* for PAGE_SIZE */ +#include <linux/overflow.h> #include "permassert.h" #include "thread-registry.h" @@ -16,86 +17,35 @@ int __must_check vdo_allocate_memory(size_t size, size_t align, const char *what, void *ptr); /* - * Allocate storage based on element counts, 
sizes, and alignment. - * - * This is a generalized form of our allocation use case: It allocates an array of objects, - * optionally preceded by one object of another type (i.e., a struct with trailing variable-length - * array), with the alignment indicated. - * - * Why is this inline? The sizes and alignment will always be constant, when invoked through the - * macros below, and often the count will be a compile-time constant 1 or the number of extra bytes - * will be a compile-time constant 0. So at least some of the arithmetic can usually be optimized - * away, and the run-time selection between allocation functions always can. In many cases, it'll - * boil down to just a function call with a constant size. - * - * @count: The number of objects to allocate - * @size: The size of an object - * @extra: The number of additional bytes to allocate - * @align: The required alignment - * @what: What is being allocated (for error logging) - * @ptr: A pointer to hold the allocated memory - * - * Return: VDO_SUCCESS or an error code - */ -static inline int __vdo_do_allocation(size_t count, size_t size, size_t extra, - size_t align, const char *what, void *ptr) -{ - size_t total_size = count * size + extra; - - /* Overflow check: */ - if ((size > 0) && (count > ((SIZE_MAX - extra) / size))) { - /* - * This is kind of a hack: We rely on the fact that SIZE_MAX would cover the entire - * address space (minus one byte) and thus the system can never allocate that much - * and the call will always fail. So we can report an overflow as "out of memory" - * by asking for "merely" SIZE_MAX bytes. - */ - total_size = SIZE_MAX; - } - - return vdo_allocate_memory(total_size, align, what, ptr); -} - -/* * Allocate one or more elements of the indicated type, logging an error if the allocation fails. * The memory will be zeroed. * * @COUNT: The number of objects to allocate - * @TYPE: The type of objects to allocate. This type determines the alignment of the allocation. 
* @WHAT: What is being allocated (for error logging) * @PTR: A pointer to hold the allocated memory * * Return: VDO_SUCCESS or an error code */ -#define vdo_allocate(COUNT, TYPE, WHAT, PTR) \ - __vdo_do_allocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) +#define vdo_allocate(COUNT, WHAT, PTR) \ + vdo_allocate_memory(size_mul((COUNT), sizeof(typeof(**(PTR)))), \ + __alignof__(typeof(**(PTR))), WHAT, PTR) /* - * Allocate one object of an indicated type, followed by one or more elements of a second type, - * logging an error if the allocation fails. The memory will be zeroed. + * Allocate a structure with a flexible array member, with a specified number of elements, logging + * an error if the allocation fails. The memory will be zeroed. * - * @TYPE1: The type of the primary object to allocate. This type determines the alignment of the - * allocated memory. * @COUNT: The number of objects to allocate - * @TYPE2: The type of array objects to allocate + * @FIELD: The flexible array field at the end of the structure * @WHAT: What is being allocated (for error logging) * @PTR: A pointer to hold the allocated memory * * Return: VDO_SUCCESS or an error code */ -#define vdo_allocate_extended(TYPE1, COUNT, TYPE2, WHAT, PTR) \ - __extension__({ \ - int _result; \ - TYPE1 **_ptr = (PTR); \ - BUILD_BUG_ON(__alignof__(TYPE1) < __alignof__(TYPE2)); \ - _result = __vdo_do_allocation(COUNT, \ - sizeof(TYPE2), \ - sizeof(TYPE1), \ - __alignof__(TYPE1), \ - WHAT, \ - _ptr); \ - _result; \ - }) +#define vdo_allocate_extended(COUNT, FIELD, WHAT, PTR) \ + vdo_allocate_memory(struct_size(*(PTR), FIELD, (COUNT)), \ + __alignof__(typeof(**(PTR))), \ + WHAT, \ + (PTR)) /* * Allocate memory starting on a cache line boundary, logging an error if the allocation fails. 
The diff --git a/drivers/md/dm-vdo/message-stats.c b/drivers/md/dm-vdo/message-stats.c index 75dfcd7c5f63..b4c919780c22 100644 --- a/drivers/md/dm-vdo/message-stats.c +++ b/drivers/md/dm-vdo/message-stats.c @@ -420,7 +420,7 @@ int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen) struct vdo_statistics *stats; int result; - result = vdo_allocate(1, struct vdo_statistics, __func__, &stats); + result = vdo_allocate(1, __func__, &stats); if (result != VDO_SUCCESS) { vdo_log_error("Cannot allocate memory to write VDO statistics"); return result; diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c index 666be6d557e1..ea2d8d14495c 100644 --- a/drivers/md/dm-vdo/packer.c +++ b/drivers/md/dm-vdo/packer.c @@ -120,8 +120,7 @@ static int __must_check make_bin(struct packer *packer) struct packer_bin *bin; int result; - result = vdo_allocate_extended(struct packer_bin, VDO_MAX_COMPRESSION_SLOTS, - struct vio *, __func__, &bin); + result = vdo_allocate_extended(VDO_MAX_COMPRESSION_SLOTS, incoming, __func__, &bin); if (result != VDO_SUCCESS) return result; @@ -146,7 +145,7 @@ int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **pa block_count_t i; int result; - result = vdo_allocate(1, struct packer, __func__, &packer); + result = vdo_allocate(1, __func__, &packer); if (result != VDO_SUCCESS) return result; @@ -168,8 +167,8 @@ int vdo_make_packer(struct vdo *vdo, block_count_t bin_count, struct packer **pa * bin must have a canceler for which it is waiting, and any canceler will only have * canceled one lock holder at a time. 
*/ - result = vdo_allocate_extended(struct packer_bin, MAXIMUM_VDO_USER_VIOS / 2, - struct vio *, __func__, &packer->canceled_bin); + result = vdo_allocate_extended(MAXIMUM_VDO_USER_VIOS / 2, incoming, __func__, + &packer->canceled_bin); if (result != VDO_SUCCESS) { vdo_free_packer(packer); return result; diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c index 686eb7d714e6..d6ad8f1a33bb 100644 --- a/drivers/md/dm-vdo/physical-zone.c +++ b/drivers/md/dm-vdo/physical-zone.c @@ -200,7 +200,7 @@ struct pbn_lock_pool { /** @idle_list: A list containing all idle PBN lock instances. */ struct list_head idle_list; /** @locks: The memory for all the locks allocated by this pool. */ - idle_pbn_lock locks[]; + idle_pbn_lock locks[] __counted_by(capacity); }; /** @@ -240,8 +240,7 @@ static int make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr) struct pbn_lock_pool *pool; int result; - result = vdo_allocate_extended(struct pbn_lock_pool, capacity, idle_pbn_lock, - __func__, &pool); + result = vdo_allocate_extended(capacity, locks, __func__, &pool); if (result != VDO_SUCCESS) return result; @@ -368,8 +367,7 @@ int vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr) if (zone_count == 0) return VDO_SUCCESS; - result = vdo_allocate_extended(struct physical_zones, zone_count, - struct physical_zone, __func__, &zones); + result = vdo_allocate_extended(zone_count, zones, __func__, &zones); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c index 9bae8256ba4e..bb8a878ce4e5 100644 --- a/drivers/md/dm-vdo/priority-table.c +++ b/drivers/md/dm-vdo/priority-table.c @@ -60,8 +60,7 @@ int vdo_make_priority_table(unsigned int max_priority, struct priority_table **t if (max_priority > MAX_PRIORITY) return UDS_INVALID_ARGUMENT; - result = vdo_allocate_extended(struct priority_table, max_priority + 1, - struct bucket, __func__, &table); + result = 
vdo_allocate_extended(max_priority + 1, buckets, __func__, &table); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c index 9cc0f0ff1664..f03939cc89e3 100644 --- a/drivers/md/dm-vdo/recovery-journal.c +++ b/drivers/md/dm-vdo/recovery-journal.c @@ -593,32 +593,29 @@ static int __must_check initialize_lock_counter(struct recovery_journal *journal struct thread_config *config = &vdo->thread_config; struct lock_counter *counter = &journal->lock_counter; - result = vdo_allocate(journal->size, u16, __func__, &counter->journal_counters); + result = vdo_allocate(journal->size, __func__, &counter->journal_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size, atomic_t, __func__, - &counter->journal_decrement_counts); + result = vdo_allocate(journal->size, __func__, &counter->journal_decrement_counts); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size * config->logical_zone_count, u16, __func__, + result = vdo_allocate(journal->size * config->logical_zone_count, __func__, &counter->logical_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size, atomic_t, __func__, - &counter->logical_zone_counts); + result = vdo_allocate(journal->size, __func__, &counter->logical_zone_counts); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size * config->physical_zone_count, u16, __func__, + result = vdo_allocate(journal->size * config->physical_zone_count, __func__, &counter->physical_counters); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(journal->size, atomic_t, __func__, - &counter->physical_zone_counts); + result = vdo_allocate(journal->size, __func__, &counter->physical_zone_counts); if (result != VDO_SUCCESS) return result; @@ -672,7 +669,7 @@ static int initialize_recovery_block(struct vdo *vdo, struct recovery_journal *j * Allocate a full block for the 
journal block even though not all of the space is used * since the VIO needs to write a full disk block. */ - result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &data); + result = vdo_allocate(VDO_BLOCK_SIZE, __func__, &data); if (result != VDO_SUCCESS) return result; @@ -711,10 +708,8 @@ int vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, nonce_t struct recovery_journal *journal; int result; - result = vdo_allocate_extended(struct recovery_journal, - RECOVERY_JOURNAL_RESERVED_BLOCKS, - struct recovery_journal_block, __func__, - &journal); + result = vdo_allocate_extended(RECOVERY_JOURNAL_RESERVED_BLOCKS, blocks, + __func__, &journal); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/repair.c b/drivers/md/dm-vdo/repair.c index 8c006fb3afcf..bfed62260280 100644 --- a/drivers/md/dm-vdo/repair.c +++ b/drivers/md/dm-vdo/repair.c @@ -127,7 +127,7 @@ struct repair_completion { * The page completions used for playing the journal into the block map, and, during * read-only rebuild, for rebuilding the reference counts from the block map. */ - struct vdo_page_completion page_completions[]; + struct vdo_page_completion page_completions[] __counted_by(page_count); }; /* @@ -1417,8 +1417,7 @@ static int parse_journal_for_rebuild(struct repair_completion *repair) * packed_recovery_journal_entry from every valid journal block. */ count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block); - result = vdo_allocate(count, struct numbered_block_mapping, __func__, - &repair->entries); + result = vdo_allocate(count, __func__, &repair->entries); if (result != VDO_SUCCESS) return result; @@ -1464,8 +1463,7 @@ static int extract_new_mappings(struct repair_completion *repair) * Allocate an array of numbered_block_mapping structs just large enough to transcribe * every packed_recovery_journal_entry from every valid journal block. 
*/ - result = vdo_allocate(repair->entry_count, struct numbered_block_mapping, - __func__, &repair->entries); + result = vdo_allocate(repair->entry_count, __func__, &repair->entries); if (result != VDO_SUCCESS) return result; @@ -1715,9 +1713,7 @@ void vdo_repair(struct vdo_completion *parent) vdo_log_warning("Device was dirty, rebuilding reference counts"); } - result = vdo_allocate_extended(struct repair_completion, page_count, - struct vdo_page_completion, __func__, - &repair); + result = vdo_allocate_extended(page_count, page_completions, __func__, &repair); if (result != VDO_SUCCESS) { vdo_fail_completion(parent, result); return; @@ -1729,12 +1725,11 @@ void vdo_repair(struct vdo_completion *parent) prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN); repair->page_count = page_count; - result = vdo_allocate(remaining * VDO_BLOCK_SIZE, char, __func__, - &repair->journal_data); + result = vdo_allocate(remaining * VDO_BLOCK_SIZE, __func__, &repair->journal_data); if (abort_on_error(result, repair)) return; - result = vdo_allocate(vio_count, struct vio, __func__, &repair->vios); + result = vdo_allocate(vio_count, __func__, &repair->vios); if (abort_on_error(result, repair)) return; diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 034ecaa51f48..7fcbb361b38d 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -2453,8 +2453,7 @@ static int allocate_slab_counters(struct vdo_slab *slab) if (result != VDO_SUCCESS) return result; - result = vdo_allocate(slab->reference_block_count, struct reference_block, - __func__, &slab->reference_blocks); + result = vdo_allocate(slab->reference_block_count, __func__, &slab->reference_blocks); if (result != VDO_SUCCESS) return result; @@ -2463,8 +2462,7 @@ static int allocate_slab_counters(struct vdo_slab *slab) * so we can word-search even at the very end. 
*/ bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD); - result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array", - &slab->counters); + result = vdo_allocate(bytes, "ref counts array", &slab->counters); if (result != VDO_SUCCESS) { vdo_free(vdo_forget(slab->reference_blocks)); return result; @@ -3563,8 +3561,7 @@ static int get_slab_statuses(struct block_allocator *allocator, struct slab_status *statuses; struct slab_iterator iterator = get_slab_iterator(allocator); - result = vdo_allocate(allocator->slab_count, struct slab_status, __func__, - &statuses); + result = vdo_allocate(allocator->slab_count, __func__, &statuses); if (result != VDO_SUCCESS) return result; @@ -3739,13 +3736,12 @@ static int initialize_slab_journal(struct vdo_slab *slab) const struct slab_config *slab_config = &slab->allocator->depot->slab_config; int result; - result = vdo_allocate(slab_config->slab_journal_blocks, struct journal_lock, - __func__, &journal->locks); + result = vdo_allocate(slab_config->slab_journal_blocks, __func__, &journal->locks); if (result != VDO_SUCCESS) return result; - result = vdo_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block", - (char **) &journal->block); + BUILD_BUG_ON(sizeof(*journal->block) != VDO_BLOCK_SIZE); + result = vdo_allocate(1, "struct packed_slab_journal_block", &journal->block); if (result != VDO_SUCCESS) return result; @@ -3800,7 +3796,7 @@ static int __must_check make_slab(physical_block_number_t slab_origin, struct vdo_slab *slab; int result; - result = vdo_allocate(1, struct vdo_slab, __func__, &slab); + result = vdo_allocate(1, __func__, &slab); if (result != VDO_SUCCESS) return result; @@ -3857,8 +3853,7 @@ static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count) physical_block_number_t slab_origin; int result; - result = vdo_allocate(slab_count, struct vdo_slab *, - "slab pointer array", &depot->new_slabs); + result = vdo_allocate(slab_count, "slab pointer array", 
&depot->new_slabs); if (result != VDO_SUCCESS) return result; @@ -4011,8 +4006,7 @@ static int initialize_slab_scrubber(struct block_allocator *allocator) char *journal_data; int result; - result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size, - char, __func__, &journal_data); + result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size, __func__, &journal_data); if (result != VDO_SUCCESS) return result; @@ -4045,7 +4039,7 @@ static int __must_check initialize_slab_summary_block(struct block_allocator *al struct slab_summary_block *block = &allocator->summary_blocks[index]; int result; - result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries); + result = vdo_allocate(VDO_BLOCK_SIZE, __func__, &block->outgoing_entries); if (result != VDO_SUCCESS) return result; @@ -4114,8 +4108,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, if (result != VDO_SUCCESS) return result; - result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, - struct slab_summary_block, __func__, + result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, __func__, &allocator->summary_blocks); if (result != VDO_SUCCESS) return result; @@ -4174,8 +4167,7 @@ static int allocate_components(struct slab_depot *depot, depot->summary_origin = summary_partition->offset; depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift); - result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, - struct slab_summary_entry, __func__, + result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, __func__, &depot->summary_entries); if (result != VDO_SUCCESS) return result; @@ -4262,9 +4254,12 @@ int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo, } slab_size_shift = ilog2(slab_size); - result = vdo_allocate_extended(struct slab_depot, - vdo->thread_config.physical_zone_count, - struct block_allocator, __func__, &depot); + if (state.zone_count > MAX_VDO_PHYSICAL_ZONES) + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + 
"invalid zone count"); + + result = vdo_allocate_extended(vdo->thread_config.physical_zone_count, + allocators, __func__, &depot); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h index fadc0c9d4dc4..6bfd61c937b6 100644 --- a/drivers/md/dm-vdo/slab-depot.h +++ b/drivers/md/dm-vdo/slab-depot.h @@ -509,7 +509,7 @@ struct slab_depot { struct slab_summary_entry *summary_entries; /* The block allocators for this depot */ - struct block_allocator allocators[]; + struct block_allocator allocators[] __counted_by(zone_count); }; struct reference_updater; diff --git a/drivers/md/dm-vdo/status-codes.c b/drivers/md/dm-vdo/status-codes.c index dd252d660b6d..9df5e4d7f884 100644 --- a/drivers/md/dm-vdo/status-codes.c +++ b/drivers/md/dm-vdo/status-codes.c @@ -80,6 +80,8 @@ int vdo_status_to_errno(int error) /* VDO or UDS error */ switch (error) { + case VDO_BAD_CONFIGURATION: + return -EINVAL; case VDO_NO_SPACE: return -ENOSPC; case VDO_READ_ONLY: diff --git a/drivers/md/dm-vdo/thread-utils.c b/drivers/md/dm-vdo/thread-utils.c index ec08478dd013..826afc952b56 100644 --- a/drivers/md/dm-vdo/thread-utils.c +++ b/drivers/md/dm-vdo/thread-utils.c @@ -56,7 +56,7 @@ int vdo_create_thread(void (*thread_function)(void *), void *thread_data, struct thread *thread; int result; - result = vdo_allocate(1, struct thread, __func__, &thread); + result = vdo_allocate(1, __func__, &thread); if (result != VDO_SUCCESS) { vdo_log_warning("Error allocating memory for %s", name); return result; diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h index cdf36e7d7702..0d60a88aa086 100644 --- a/drivers/md/dm-vdo/types.h +++ b/drivers/md/dm-vdo/types.h @@ -227,6 +227,9 @@ struct device_config { bool compression; struct thread_count_config thread_counts; block_count_t max_discard_blocks; + block_count_t slab_blocks; + int index_memory; + bool index_sparse; }; enum vdo_completion_type { diff --git a/drivers/md/dm-vdo/vdo.c 
b/drivers/md/dm-vdo/vdo.c index 09fd0628d18c..7bec2418c121 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -34,7 +34,9 @@ #include <linux/lz4.h> #include <linux/mutex.h> #include <linux/spinlock.h> +#include <linux/string.h> #include <linux/types.h> +#include <linux/uuid.h> #include "logger.h" #include "memory-alloc.h" @@ -55,6 +57,7 @@ #include "slab-depot.h" #include "statistics.h" #include "status-codes.h" +#include "time-utils.h" #include "vio.h" #define PARANOID_THREAD_CONSISTENCY_CHECKS 0 @@ -207,29 +210,28 @@ static int __must_check initialize_thread_config(struct thread_count_config coun config->hash_zone_count = counts.hash_zones; } - result = vdo_allocate(config->logical_zone_count, thread_id_t, - "logical thread array", &config->logical_threads); + result = vdo_allocate(config->logical_zone_count, "logical thread array", + &config->logical_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = vdo_allocate(config->physical_zone_count, thread_id_t, - "physical thread array", &config->physical_threads); + result = vdo_allocate(config->physical_zone_count, "physical thread array", + &config->physical_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = vdo_allocate(config->hash_zone_count, thread_id_t, - "hash thread array", &config->hash_zone_threads); + result = vdo_allocate(config->hash_zone_count, "hash thread array", + &config->hash_zone_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; } - result = vdo_allocate(config->bio_thread_count, thread_id_t, - "bio thread array", &config->bio_threads); + result = vdo_allocate(config->bio_thread_count, "bio thread array", &config->bio_threads); if (result != VDO_SUCCESS) { uninitialize_thread_config(config); return result; @@ -256,56 +258,35 @@ static int __must_check initialize_thread_config(struct thread_count_config coun return VDO_SUCCESS; } -/** - * 
read_geometry_block() - Synchronously read the geometry block from a vdo's underlying block - * device. - * @vdo: The vdo whose geometry is to be read. - * - * Return: VDO_SUCCESS or an error code. - */ -static int __must_check read_geometry_block(struct vdo *vdo) +static int initialize_geometry_block(struct vdo *vdo, + struct vdo_geometry_block *geometry_block) { - struct vio *vio; - char *block; int result; - result = vdo_allocate(VDO_BLOCK_SIZE, u8, __func__, &block); + result = vdo_allocate(VDO_BLOCK_SIZE, "encoded geometry block", + (char **) &vdo->geometry_block.buffer); if (result != VDO_SUCCESS) return result; - result = create_metadata_vio(vdo, VIO_TYPE_GEOMETRY, VIO_PRIORITY_HIGH, NULL, - block, &vio); - if (result != VDO_SUCCESS) { - vdo_free(block); - return result; - } + return allocate_vio_components(vdo, VIO_TYPE_GEOMETRY, + VIO_PRIORITY_METADATA, NULL, 1, + (char *) geometry_block->buffer, + &vdo->geometry_block.vio); +} - /* - * This is only safe because, having not already loaded the geometry, the vdo's geometry's - * bio_offset field is 0, so the fact that vio_reset_bio() will subtract that offset from - * the supplied pbn is not a problem. 
- */ - result = vio_reset_bio(vio, block, NULL, REQ_OP_READ, - VDO_GEOMETRY_BLOCK_LOCATION); - if (result != VDO_SUCCESS) { - free_vio(vdo_forget(vio)); - vdo_free(block); - return result; - } +static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block) +{ + int result; - bio_set_dev(vio->bio, vdo_get_backing_device(vdo)); - submit_bio_wait(vio->bio); - result = blk_status_to_errno(vio->bio->bi_status); - free_vio(vdo_forget(vio)); - if (result != 0) { - vdo_log_error_strerror(result, "synchronous read failed"); - vdo_free(block); - return -EIO; - } + result = vdo_allocate(VDO_BLOCK_SIZE, "encoded super block", + (char **) &vdo->super_block.buffer); + if (result != VDO_SUCCESS) + return result; - result = vdo_parse_geometry_block((u8 *) block, &vdo->geometry); - vdo_free(block); - return result; + return allocate_vio_components(vdo, VIO_TYPE_SUPER_BLOCK, + VIO_PRIORITY_METADATA, NULL, 1, + (char *) super_block->buffer, + &vdo->super_block.vio); } static bool get_zone_thread_name(const thread_id_t thread_ids[], zone_count_t count, @@ -453,6 +434,69 @@ static int register_vdo(struct vdo *vdo) } /** + * vdo_format() - Format a block device to function as a new VDO. + * @vdo: The vdo to format. + * @error_ptr: The reason for any failure during this call. + * + * This function must be called on a device before a VDO can be loaded for the first time. + * Once a device has been formatted, the VDO can be loaded and shut down repeatedly. + * If a new VDO is desired, this function should be called again. 
+ * + * Return: VDO_SUCCESS or an error + **/ +static int __must_check vdo_format(struct vdo *vdo, char **error_ptr) +{ + int result; + uuid_t uuid; + nonce_t nonce = current_time_us(); + struct device_config *config = vdo->device_config; + + struct index_config index_config = { + .mem = config->index_memory, + .sparse = config->index_sparse, + }; + + struct vdo_config vdo_config = { + .logical_blocks = config->logical_blocks, + .physical_blocks = config->physical_blocks, + .slab_size = config->slab_blocks, + .slab_journal_blocks = DEFAULT_VDO_SLAB_JOURNAL_SIZE, + .recovery_journal_size = DEFAULT_VDO_RECOVERY_JOURNAL_SIZE, + }; + + uuid_gen(&uuid); + result = vdo_initialize_volume_geometry(nonce, &uuid, &index_config, &vdo->geometry); + if (result != VDO_SUCCESS) { + *error_ptr = "Could not initialize volume geometry during format"; + return result; + } + + result = vdo_initialize_component_states(&vdo_config, &vdo->geometry, nonce, &vdo->states); + if (result == VDO_NO_SPACE) { + block_count_t slab_blocks = config->slab_blocks; + /* 1 is counting geometry block */ + block_count_t fixed_layout_size = 1 + + vdo->geometry.regions[VDO_DATA_REGION].start_block + + DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT + + DEFAULT_VDO_RECOVERY_JOURNAL_SIZE + VDO_SLAB_SUMMARY_BLOCKS; + block_count_t necessary_size = fixed_layout_size + slab_blocks; + + vdo_log_error("Minimum required size for VDO volume: %llu bytes", + (unsigned long long) necessary_size * VDO_BLOCK_SIZE); + *error_ptr = "Could not allocate enough space for VDO during format"; + return result; + } + if (result != VDO_SUCCESS) { + *error_ptr = "Could not initialize data layout during format"; + return result; + } + + vdo->needs_formatting = true; + + return VDO_SUCCESS; +} + +/** * initialize_vdo() - Do the portion of initializing a vdo which will clean up after itself on * error. 
* @vdo: The vdo being initialized @@ -475,12 +519,39 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, vdo_initialize_completion(&vdo->admin.completion, vdo, VDO_ADMIN_COMPLETION); init_completion(&vdo->admin.callback_sync); mutex_init(&vdo->stats_mutex); - result = read_geometry_block(vdo); + + result = initialize_geometry_block(vdo, &vdo->geometry_block); + if (result != VDO_SUCCESS) { + *reason = "Could not initialize geometry block"; + return result; + } + + result = initialize_super_block(vdo, &vdo->super_block); + if (result != VDO_SUCCESS) { + *reason = "Could not initialize super block"; + return result; + } + + result = vdo_submit_metadata_vio_wait(&vdo->geometry_block.vio, + VDO_GEOMETRY_BLOCK_LOCATION, REQ_OP_READ); if (result != VDO_SUCCESS) { *reason = "Could not load geometry block"; return result; } + if (mem_is_zero(vdo->geometry_block.vio.data, VDO_BLOCK_SIZE)) { + result = vdo_format(vdo, reason); + if (result != VDO_SUCCESS) + return result; + } else { + result = vdo_parse_geometry_block(vdo->geometry_block.buffer, + &vdo->geometry); + if (result != VDO_SUCCESS) { + *reason = "Could not parse geometry block"; + return result; + } + } + result = initialize_thread_config(config->thread_counts, &vdo->thread_config); if (result != VDO_SUCCESS) { *reason = "Cannot create thread configuration"; @@ -493,7 +564,7 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, config->thread_counts.hash_zones, vdo->thread_config.thread_count); /* Compression context storage */ - result = vdo_allocate(config->thread_counts.cpu_threads, char *, "LZ4 context", + result = vdo_allocate(config->thread_counts.cpu_threads, "LZ4 context", &vdo->compression_context); if (result != VDO_SUCCESS) { *reason = "cannot allocate LZ4 context"; @@ -501,7 +572,7 @@ static int initialize_vdo(struct vdo *vdo, struct device_config *config, } for (i = 0; i < config->thread_counts.cpu_threads; i++) { - result = vdo_allocate(LZ4_MEM_COMPRESS, 
char, "LZ4 context", + result = vdo_allocate(LZ4_MEM_COMPRESS, "LZ4 context", &vdo->compression_context[i]); if (result != VDO_SUCCESS) { *reason = "cannot allocate LZ4 context"; @@ -537,7 +608,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, /* Initialize with a generic failure reason to prevent returning garbage. */ *reason = "Unspecified error"; - result = vdo_allocate(1, struct vdo, __func__, &vdo); + result = vdo_allocate(1, __func__, &vdo); if (result != VDO_SUCCESS) { *reason = "Cannot allocate VDO"; return result; @@ -554,8 +625,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), "vdo%u", instance); - result = vdo_allocate(vdo->thread_config.thread_count, - struct vdo_thread, __func__, &vdo->threads); + result = vdo_allocate(vdo->thread_config.thread_count, __func__, &vdo->threads); if (result != VDO_SUCCESS) { *reason = "Cannot allocate thread structures"; return result; @@ -648,6 +718,12 @@ static void free_listeners(struct vdo_thread *thread) } } +static void uninitialize_geometry_block(struct vdo_geometry_block *geometry_block) +{ + free_vio_components(&geometry_block->vio); + vdo_free(geometry_block->buffer); +} + static void uninitialize_super_block(struct vdo_super_block *super_block) { free_vio_components(&super_block->vio); @@ -695,6 +771,7 @@ void vdo_destroy(struct vdo *vdo) vdo_uninitialize_layout(&vdo->next_layout); if (vdo->partition_copier) dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier)); + uninitialize_geometry_block(&vdo->geometry_block); uninitialize_super_block(&vdo->super_block); vdo_free_block_map(vdo_forget(vdo->block_map)); vdo_free_hash_zones(vdo_forget(vdo->hash_zones)); @@ -720,21 +797,6 @@ void vdo_destroy(struct vdo *vdo) vdo_free(vdo); } -static int initialize_super_block(struct vdo *vdo, struct vdo_super_block *super_block) -{ - int result; - - result = 
vdo_allocate(VDO_BLOCK_SIZE, char, "encoded super block", - (char **) &vdo->super_block.buffer); - if (result != VDO_SUCCESS) - return result; - - return allocate_vio_components(vdo, VIO_TYPE_SUPER_BLOCK, - VIO_PRIORITY_METADATA, NULL, 1, - (char *) super_block->buffer, - &vdo->super_block.vio); -} - /** * finish_reading_super_block() - Continue after loading the super block. * @completion: The super block vio. @@ -778,14 +840,6 @@ static void read_super_block_endio(struct bio *bio) */ void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent) { - int result; - - result = initialize_super_block(vdo, &vdo->super_block); - if (result != VDO_SUCCESS) { - vdo_continue_completion(parent, result); - return; - } - vdo->super_block.vio.completion.parent = parent; vdo_submit_metadata_vio(&vdo->super_block.vio, vdo_get_data_region_start(vdo->geometry), @@ -899,24 +953,101 @@ static void record_vdo(struct vdo *vdo) vdo->states.layout = vdo->layout; } +static int __must_check clear_partition(struct vdo *vdo, enum partition_id id) +{ + struct partition *partition; + int result; + + result = vdo_get_partition(&vdo->states.layout, id, &partition); + if (result != VDO_SUCCESS) + return result; + + return blkdev_issue_zeroout(vdo_get_backing_device(vdo), + partition->offset * VDO_SECTORS_PER_BLOCK, + partition->count * VDO_SECTORS_PER_BLOCK, + GFP_NOWAIT, 0); +} + +int vdo_clear_layout(struct vdo *vdo) +{ + int result; + + /* Zero out the uds index's first block. */ + result = blkdev_issue_zeroout(vdo_get_backing_device(vdo), + VDO_SECTORS_PER_BLOCK, + VDO_SECTORS_PER_BLOCK, + GFP_NOWAIT, 0); + if (result != VDO_SUCCESS) + return result; + + result = clear_partition(vdo, VDO_BLOCK_MAP_PARTITION); + if (result != VDO_SUCCESS) + return result; + + return clear_partition(vdo, VDO_RECOVERY_JOURNAL_PARTITION); +} + /** - * continue_super_block_parent() - Continue the parent of a super block save operation. - * @completion: The super block vio. 
+ * continue_parent() - Continue the parent of a save operation. + * @completion: The completion to continue. * - * This callback is registered in vdo_save_components(). */ -static void continue_super_block_parent(struct vdo_completion *completion) +static void continue_parent(struct vdo_completion *completion) { vdo_continue_completion(vdo_forget(completion->parent), completion->result); } +static void handle_write_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo_completion *parent = vio->completion.parent; + + continue_vio_after_io(vio, continue_parent, + parent->callback_thread_id); +} + /** - * handle_save_error() - Log a super block save error. + * handle_geometry_block_save_error() - Log a geometry block save error. + * @completion: The super block vio. + * + * This error handler is registered in vdo_save_geometry_block(). + */ +static void handle_geometry_block_save_error(struct vdo_completion *completion) +{ + struct vdo_geometry_block *geometry_block = + container_of(as_vio(completion), struct vdo_geometry_block, vio); + + vio_record_metadata_io_error(&geometry_block->vio); + vdo_log_error_strerror(completion->result, "geometry block save failed"); + completion->callback(completion); +} + +/** + * vdo_save_geometry_block() - Encode the vdo and save the geometry block asynchronously. + * @vdo: The vdo whose state is being saved. + * @parent: The completion to notify when the save is complete. 
+ */ +void vdo_save_geometry_block(struct vdo *vdo, struct vdo_completion *parent) +{ + struct vdo_geometry_block *geometry_block = &vdo->geometry_block; + + vdo_encode_volume_geometry(geometry_block->buffer, &vdo->geometry, + VDO_DEFAULT_GEOMETRY_BLOCK_VERSION); + geometry_block->vio.completion.parent = parent; + geometry_block->vio.completion.callback_thread_id = parent->callback_thread_id; + vdo_submit_metadata_vio(&geometry_block->vio, + VDO_GEOMETRY_BLOCK_LOCATION, + handle_write_endio, handle_geometry_block_save_error, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); +} + +/** + * handle_super_block_save_error() - Log a super block save error. * @completion: The super block vio. * * This error handler is registered in vdo_save_components(). */ -static void handle_save_error(struct vdo_completion *completion) +static void handle_super_block_save_error(struct vdo_completion *completion) { struct vdo_super_block *super_block = container_of(as_vio(completion), struct vdo_super_block, vio); @@ -935,17 +1066,27 @@ static void handle_save_error(struct vdo_completion *completion) completion->callback(completion); } -static void super_block_write_endio(struct bio *bio) +/** + * vdo_save_super_block() - Save the component states to the super block asynchronously. + * @vdo: The vdo whose state is being saved. + * @parent: The completion to notify when the save is complete. 
+ */ +void vdo_save_super_block(struct vdo *vdo, struct vdo_completion *parent) { - struct vio *vio = bio->bi_private; - struct vdo_completion *parent = vio->completion.parent; + struct vdo_super_block *super_block = &vdo->super_block; - continue_vio_after_io(vio, continue_super_block_parent, - parent->callback_thread_id); + vdo_encode_super_block(super_block->buffer, &vdo->states); + super_block->vio.completion.parent = parent; + super_block->vio.completion.callback_thread_id = parent->callback_thread_id; + vdo_submit_metadata_vio(&super_block->vio, + vdo_get_data_region_start(vdo->geometry), + handle_write_endio, handle_super_block_save_error, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); } /** - * vdo_save_components() - Encode the vdo and save the super block asynchronously. + * vdo_save_components() - Copy the current state of the VDO to the states struct and save + * it to the super block asynchronously. * @vdo: The vdo whose state is being saved. * @parent: The completion to notify when the save is complete. 
*/ @@ -964,14 +1105,7 @@ void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent) } record_vdo(vdo); - - vdo_encode_super_block(super_block->buffer, &vdo->states); - super_block->vio.completion.parent = parent; - super_block->vio.completion.callback_thread_id = parent->callback_thread_id; - vdo_submit_metadata_vio(&super_block->vio, - vdo_get_data_region_start(vdo->geometry), - super_block_write_endio, handle_save_error, - REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); + vdo_save_super_block(vdo, parent); } /** @@ -997,8 +1131,7 @@ int vdo_register_read_only_listener(struct vdo *vdo, void *listener, if (result != VDO_SUCCESS) return result; - result = vdo_allocate(1, struct read_only_listener, __func__, - &read_only_listener); + result = vdo_allocate(1, __func__, &read_only_listener); if (result != VDO_SUCCESS) return result; diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h index 1aaba73997b7..9a63f5d45ce3 100644 --- a/drivers/md/dm-vdo/vdo.h +++ b/drivers/md/dm-vdo/vdo.h @@ -144,6 +144,13 @@ struct thread_config { struct thread_count_config; +struct vdo_geometry_block { + /* The vio for reading and writing the geometry block to disk */ + struct vio vio; + /* A buffer to hold the geometry block */ + u8 *buffer; +}; + struct vdo_super_block { /* The vio for reading and writing the super block to disk */ struct vio vio; @@ -186,6 +193,9 @@ struct vdo { /* The thread mapping */ struct thread_config thread_config; + /* The geometry block */ + struct vdo_geometry_block geometry_block; + /* The super block */ struct vdo_super_block super_block; @@ -236,6 +246,7 @@ struct vdo { const struct admin_state_code *suspend_type; bool allocations_allowed; bool dump_on_shutdown; + bool needs_formatting; atomic_t processing_message; /* @@ -304,6 +315,10 @@ int __must_check vdo_make(unsigned int instance, struct device_config *config, void vdo_destroy(struct vdo *vdo); +int __must_check vdo_format_components(struct vdo *vdo); + +void 
vdo_format_super_block(struct vdo *vdo, struct vdo_completion *parent); + void vdo_load_super_block(struct vdo *vdo, struct vdo_completion *parent); struct block_device * __must_check vdo_get_backing_device(const struct vdo *vdo); @@ -326,6 +341,10 @@ enum vdo_state __must_check vdo_get_state(const struct vdo *vdo); void vdo_set_state(struct vdo *vdo, enum vdo_state state); +int vdo_clear_layout(struct vdo *vdo); +void vdo_save_geometry_block(struct vdo *vdo, struct vdo_completion *parent); +void vdo_save_super_block(struct vdo *vdo, struct vdo_completion *parent); + void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent); int vdo_register_read_only_listener(struct vdo *vdo, void *listener, diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index 5ffc867d9c5e..ea8ac619ff1b 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -52,8 +52,8 @@ static int create_multi_block_bio(block_count_t size, struct bio **bio_ptr) struct bio *bio = NULL; int result; - result = vdo_allocate_extended(struct bio, size + 1, struct bio_vec, - "bio", &bio); + result = vdo_allocate_memory(sizeof(struct bio) + sizeof(struct bio_vec) * (size + 1), + __alignof__(struct bio), "bio", &bio); if (result != VDO_SUCCESS) return result; @@ -129,7 +129,7 @@ int create_multi_block_metadata_vio(struct vdo *vdo, enum vio_type vio_type, * Metadata vios should use direct allocation and not use the buffer pool, which is * reserved for submissions from the linux block layer. 
*/ - result = vdo_allocate(1, struct vio, __func__, &vio); + result = vdo_allocate(1, __func__, &vio); if (result != VDO_SUCCESS) { vdo_log_error("metadata vio allocation failure %d", result); return result; @@ -327,8 +327,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_ int result; size_t per_vio_size = VDO_BLOCK_SIZE * block_count; - result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, - __func__, &pool); + result = vdo_allocate_extended(pool_size, vios, __func__, &pool); if (result != VDO_SUCCESS) return result; @@ -336,8 +335,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_ INIT_LIST_HEAD(&pool->available); INIT_LIST_HEAD(&pool->busy); - result = vdo_allocate(pool_size * per_vio_size, char, - "VIO pool buffer", &pool->buffer); + result = vdo_allocate(pool_size * per_vio_size, "VIO pool buffer", &pool->buffer); if (result != VDO_SUCCESS) { free_vio_pool(pool); return result; diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 14be4d888af3..85ad9dc210ff 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -11,163 +11,131 @@ #define DM_MSG_PREFIX "verity-fec" /* - * When correcting a data block, the FEC code performs optimally when it can - * collect all the associated RS blocks at the same time. As each byte is part - * of a different RS block, there are '1 << data_dev_block_bits' RS blocks. - * There are '1 << DM_VERITY_FEC_BUF_RS_BITS' RS blocks per buffer, so that - * gives '1 << (data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS)' buffers. + * When correcting a block, the FEC implementation performs optimally when it + * can collect all the associated RS codewords at the same time. As each byte + * is part of a different codeword, there are '1 << data_dev_block_bits' + * codewords. 
Each buffer has space for the message bytes for + * '1 << DM_VERITY_FEC_BUF_RS_BITS' codewords, so that gives + * '1 << (data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS)' buffers. */ static inline unsigned int fec_max_nbufs(struct dm_verity *v) { return 1 << (v->data_dev_block_bits - DM_VERITY_FEC_BUF_RS_BITS); } -/* - * Return an interleaved offset for a byte in RS block. - */ -static inline u64 fec_interleave(struct dm_verity *v, u64 offset) -{ - u32 mod; - - mod = do_div(offset, v->fec->rsn); - return offset + mod * (v->fec->rounds << v->data_dev_block_bits); -} - -/* - * Read error-correcting codes for the requested RS block. Returns a pointer - * to the data block. Caller is responsible for releasing buf. - */ -static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, - unsigned int *offset, unsigned int par_buf_offset, - struct dm_buffer **buf, unsigned short ioprio) -{ - u64 position, block, rem; - u8 *res; - - /* We have already part of parity bytes read, skip to the next block */ - if (par_buf_offset) - index++; - - position = (index + rsb) * v->fec->roots; - block = div64_u64_rem(position, v->fec->io_size, &rem); - *offset = par_buf_offset ? 0 : (unsigned int)rem; - - res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio); - if (IS_ERR(res)) { - DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", - v->data_dev->name, (unsigned long long)rsb, - (unsigned long long)block, PTR_ERR(res)); - *buf = NULL; - } - - return res; -} - /* Loop over each allocated buffer. */ #define fec_for_each_buffer(io, __i) \ for (__i = 0; __i < (io)->nbufs; __i++) -/* Loop over each RS block in each allocated buffer. */ -#define fec_for_each_buffer_rs_block(io, __i, __j) \ +/* Loop over each RS message in each allocated buffer. */ +/* To stop early, use 'goto', not 'break' (since this uses nested loops). 
*/ +#define fec_for_each_buffer_rs_message(io, __i, __j) \ fec_for_each_buffer(io, __i) \ for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++) /* - * Return a pointer to the current RS block when called inside - * fec_for_each_buffer_rs_block. + * Return a pointer to the current RS message when called inside + * fec_for_each_buffer_rs_message. */ -static inline u8 *fec_buffer_rs_block(struct dm_verity *v, - struct dm_verity_fec_io *fio, - unsigned int i, unsigned int j) +static inline u8 *fec_buffer_rs_message(struct dm_verity *v, + struct dm_verity_fec_io *fio, + unsigned int i, unsigned int j) { - return &fio->bufs[i][j * v->fec->rsn]; + return &fio->bufs[i][j * v->fec->rs_k]; } /* - * Return an index to the current RS block when called inside - * fec_for_each_buffer_rs_block. - */ -static inline unsigned int fec_buffer_rs_index(unsigned int i, unsigned int j) -{ - return (i << DM_VERITY_FEC_BUF_RS_BITS) + j; -} - -/* - * Decode all RS blocks from buffers and copy corrected bytes into fio->output - * starting from block_offset. + * Decode all RS codewords whose message bytes were loaded into fio->bufs. Copy + * the corrected bytes into fio->output starting from out_pos. 
*/ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, - struct dm_verity_fec_io *fio, u64 rsb, int byte_index, - unsigned int block_offset, int neras) + struct dm_verity_fec_io *fio, u64 target_block, + unsigned int target_region, u64 index_in_region, + unsigned int out_pos, int neras) { - int r, corrected = 0, res; + int r = 0, corrected = 0, res; struct dm_buffer *buf; - unsigned int n, i, j, offset, par_buf_offset = 0; - uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; - u8 *par, *block; + unsigned int n, i, j, parity_pos, to_copy; + uint16_t par_buf[DM_VERITY_FEC_MAX_ROOTS]; + u8 *par, *msg_buf; + u64 parity_block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - par = fec_read_parity(v, rsb, block_offset, &offset, - par_buf_offset, &buf, bio->bi_ioprio); - if (IS_ERR(par)) + /* + * Compute the index of the first parity block that will be needed and + * the starting position in that block. Then read that block. + * + * block_size is always a power of 2, but roots might not be. Note that + * when it's not, a codeword's parity bytes can span a block boundary. + */ + parity_block = ((index_in_region << v->data_dev_block_bits) + out_pos) * + v->fec->roots; + parity_pos = parity_block & (v->fec->block_size - 1); + parity_block >>= v->data_dev_block_bits; + par = dm_bufio_read_with_ioprio(v->fec->bufio, parity_block, &buf, + bio->bi_ioprio); + if (IS_ERR(par)) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, target_block, parity_block, + PTR_ERR(par)); return PTR_ERR(par); + } /* - * Decode the RS blocks we have in bufs. Each RS block results in - * one corrected target byte and consumes fec->roots parity bytes. + * Decode the RS codewords whose message bytes are in bufs. Each RS + * codeword results in one corrected target byte and consumes fec->roots + * parity bytes. 
*/ - fec_for_each_buffer_rs_block(fio, n, i) { - block = fec_buffer_rs_block(v, fio, n, i); - for (j = 0; j < v->fec->roots - par_buf_offset; j++) - par_buf[par_buf_offset + j] = par[offset + j]; - /* Decode an RS block using Reed-Solomon */ - res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn, + fec_for_each_buffer_rs_message(fio, n, i) { + msg_buf = fec_buffer_rs_message(v, fio, n, i); + + /* + * Copy the next 'roots' parity bytes to 'par_buf', reading + * another parity block if needed. + */ + to_copy = min(v->fec->block_size - parity_pos, v->fec->roots); + for (j = 0; j < to_copy; j++) + par_buf[j] = par[parity_pos++]; + if (to_copy < v->fec->roots) { + parity_block++; + parity_pos = 0; + + dm_bufio_release(buf); + par = dm_bufio_read_with_ioprio(v->fec->bufio, + parity_block, &buf, + bio->bi_ioprio); + if (IS_ERR(par)) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, target_block, + parity_block, PTR_ERR(par)); + return PTR_ERR(par); + } + for (; j < v->fec->roots; j++) + par_buf[j] = par[parity_pos++]; + } + + /* Decode an RS codeword using the Reed-Solomon library. 
*/ + res = decode_rs8(fio->rs, msg_buf, par_buf, v->fec->rs_k, NULL, neras, fio->erasures, 0, NULL); if (res < 0) { r = res; - goto error; + goto done; } - corrected += res; - fio->output[block_offset] = block[byte_index]; + fio->output[out_pos++] = msg_buf[target_region]; - block_offset++; - if (block_offset >= 1 << v->data_dev_block_bits) + if (out_pos >= v->fec->block_size) goto done; - - /* Read the next block when we run out of parity bytes */ - offset += (v->fec->roots - par_buf_offset); - /* Check if parity bytes are split between blocks */ - if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) { - par_buf_offset = v->fec->io_size - offset; - for (j = 0; j < par_buf_offset; j++) - par_buf[j] = par[offset + j]; - offset += par_buf_offset; - } else - par_buf_offset = 0; - - if (offset >= v->fec->io_size) { - dm_bufio_release(buf); - - par = fec_read_parity(v, rsb, block_offset, &offset, - par_buf_offset, &buf, bio->bi_ioprio); - if (IS_ERR(par)) - return PTR_ERR(par); - } } done: - r = corrected; -error: dm_bufio_release(buf); if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", - v->data_dev->name, (unsigned long long)rsb, r); - else if (r > 0) { + v->data_dev->name, target_block, r); + else if (r == 0) DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", - v->data_dev->name, (unsigned long long)rsb, r); - atomic64_inc(&v->fec->corrected); - } + v->data_dev->name, target_block, corrected); return r; } @@ -178,7 +146,7 @@ error: static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, const u8 *want_digest, const u8 *data) { - if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, + if (unlikely(verity_hash(v, io, data, v->fec->block_size, io->tmp_digest))) return 0; @@ -186,22 +154,35 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, } /* - * Read data blocks that are part of the RS block and deinterleave as much as - * fits into buffers. 
Check for erasure locations if @neras is non-NULL. + * Read the message block at index @index_in_region within each of the + * @v->fec->rs_k regions and deinterleave their contents into @io->fec_io->bufs. + * + * @target_block gives the index of specific block within this sequence that is + * being corrected, relative to the start of all the FEC message blocks. + * + * @out_pos gives the current output position, i.e. the position in (each) block + * from which to start the deinterleaving. Deinterleaving continues until + * either end-of-block is reached or there's no more buffer space. + * + * If @neras is non-NULL, then also use verity hashes and the presence/absence + * of I/O errors to determine which of the message blocks in the sequence are + * likely to be incorrect. Write the number of such blocks to *@neras and the + * indices of the corresponding RS message bytes in [0, k - 1] to + * @io->fec_io->erasures, up to a limit of @v->fec->roots + 1 such blocks. */ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, - u64 rsb, u64 target, unsigned int block_offset, - int *neras) + u64 target_block, u64 index_in_region, + unsigned int out_pos, int *neras) { bool is_zero; - int i, j, target_index = -1; + int i, j; struct dm_buffer *buf; struct dm_bufio_client *bufio; struct dm_verity_fec_io *fio = io->fec_io; - u64 block, ileaved; - u8 *bbuf, *rs_block; + u64 block; + u8 *bbuf; u8 want_digest[HASH_MAX_DIGESTSIZE]; - unsigned int n, k; + unsigned int n, src_pos; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); if (neras) @@ -210,21 +191,12 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, if (WARN_ON(v->digest_size > sizeof(want_digest))) return -EINVAL; - /* - * read each of the rsn data blocks that are part of the RS block, and - * interleave contents to available bufs - */ - for (i = 0; i < v->fec->rsn; i++) { - ileaved = fec_interleave(v, rsb * v->fec->rsn + i); - + for (i = 0; i < v->fec->rs_k; 
i++) { /* - * target is the data block we want to correct, target_index is - * the index of this block within the rsn RS blocks + * Read the block from region i. It contains the i'th message + * byte of the target block's RS codewords. */ - if (ileaved == target) - target_index = i; - - block = ileaved >> v->data_dev_block_bits; + block = i * v->fec->region_blocks + index_in_region; bufio = v->fec->data_bufio; if (block >= v->data_blocks) { @@ -244,9 +216,8 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio); if (IS_ERR(bbuf)) { DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", - v->data_dev->name, - (unsigned long long)rsb, - (unsigned long long)block, PTR_ERR(bbuf)); + v->data_dev->name, target_block, block, + PTR_ERR(bbuf)); /* assume the block is corrupted */ if (neras && *neras <= v->fec->roots) @@ -273,23 +244,20 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, } /* - * deinterleave and copy the bytes that fit into bufs, - * starting from block_offset + * Deinterleave the bytes of the block, starting from 'out_pos', + * into the i'th byte of the RS message buffers. Stop when + * end-of-block is reached or there are no more buffers. 
*/ - fec_for_each_buffer_rs_block(fio, n, j) { - k = fec_buffer_rs_index(n, j) + block_offset; - - if (k >= 1 << v->data_dev_block_bits) + src_pos = out_pos; + fec_for_each_buffer_rs_message(fio, n, j) { + if (src_pos >= v->fec->block_size) goto done; - - rs_block = fec_buffer_rs_block(v, fio, n, j); - rs_block[i] = bbuf[k]; + fec_buffer_rs_message(v, fio, n, j)[i] = bbuf[src_pos++]; } done: dm_bufio_release(buf); } - - return target_index; + return 0; } /* @@ -336,47 +304,65 @@ static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) unsigned int n; fec_for_each_buffer(fio, n) - memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS); + memset(fio->bufs[n], 0, v->fec->rs_k << DM_VERITY_FEC_BUF_RS_BITS); memset(fio->erasures, 0, sizeof(fio->erasures)); } /* - * Decode all RS blocks in a single data block and return the target block - * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses - * hashes to locate erasures. + * Try to correct the message (data or hash) block at index @target_block. + * + * If @use_erasures is true, use verity hashes to locate erasures. This makes + * the error correction slower but up to twice as capable. + * + * On success, return 0 and write the corrected block to @fio->output. 0 is + * returned only if the digest of the corrected block matches @want_digest; this + * is critical to ensure that FEC can't cause dm-verity to return bad data. 
*/ -static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, - struct dm_verity_fec_io *fio, u64 rsb, u64 offset, - const u8 *want_digest, bool use_erasures) +static int fec_decode(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 target_block, + const u8 *want_digest, bool use_erasures) { int r, neras = 0; - unsigned int pos; + unsigned int target_region, out_pos; + u64 index_in_region; - for (pos = 0; pos < 1 << v->data_dev_block_bits; ) { + /* + * Compute 'target_region', the index of the region the target block is + * in; and 'index_in_region', the index of the target block within its + * region. The latter value is also the index within its region of each + * message block that shares its RS codewords with the target block. + */ + target_region = div64_u64_rem(target_block, v->fec->region_blocks, + &index_in_region); + if (WARN_ON_ONCE(target_region >= v->fec->rs_k)) + /* target_block is out-of-bounds. Should never happen. */ + return -EIO; + + for (out_pos = 0; out_pos < v->fec->block_size;) { fec_init_bufs(v, fio); - r = fec_read_bufs(v, io, rsb, offset, pos, + r = fec_read_bufs(v, io, target_block, index_in_region, out_pos, use_erasures ? 
&neras : NULL); if (unlikely(r < 0)) return r; - r = fec_decode_bufs(v, io, fio, rsb, r, pos, neras); + r = fec_decode_bufs(v, io, fio, target_block, target_region, + index_in_region, out_pos, neras); if (r < 0) return r; - pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; + out_pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; } /* Always re-validate the corrected block against the expected hash */ - r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, - io->tmp_digest); + r = verity_hash(v, io, fio->output, v->fec->block_size, io->tmp_digest); if (unlikely(r < 0)) return r; if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", - v->data_dev->name, (unsigned long long)rsb, neras); + v->data_dev->name, target_block, neras); return -EILSEQ; } @@ -390,7 +376,6 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, { int r; struct dm_verity_fec_io *fio; - u64 offset, res, rsb; if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; @@ -408,37 +393,19 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, block = block - v->hash_start + v->data_blocks; /* - * For RS(M, N), the continuous FEC data is divided into blocks of N - * bytes. Since block size may not be divisible by N, the last block - * is zero padded when decoding. - * - * Each byte of the block is covered by a different RS(M, N) code, - * and each code is interleaved over N blocks to make it less likely - * that bursty corruption will leave us in unrecoverable state. - */ - - offset = block << v->data_dev_block_bits; - res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits); - - /* - * The base RS block we can feed to the interleaver to find out all - * blocks required for decoding. - */ - rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits); - - /* * Locating erasures is slow, so attempt to recover the block without * them first. 
Do a second attempt with erasures if the corruption is * bad enough. */ - r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, false); + r = fec_decode(v, io, fio, block, want_digest, false); if (r < 0) { - r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, true); + r = fec_decode(v, io, fio, block, want_digest, true); if (r < 0) goto done; } - memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + memcpy(dest, fio->output, v->fec->block_size); + atomic64_inc(&v->fec->corrected); done: fio->level--; @@ -585,8 +552,8 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) { if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c || - num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) || - num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) { + num_c < DM_VERITY_FEC_MIN_ROOTS || + num_c > DM_VERITY_FEC_MAX_ROOTS) { ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS; return -EINVAL; } @@ -625,7 +592,7 @@ int verity_fec_ctr(struct dm_verity *v) { struct dm_verity_fec *f = v->fec; struct dm_target *ti = v->ti; - u64 hash_blocks, fec_blocks; + u64 hash_blocks; int ret; if (!verity_fec_is_enabled(v)) { @@ -648,7 +615,7 @@ int verity_fec_ctr(struct dm_verity *v) * hash device after the hash blocks. 
*/ - hash_blocks = v->hash_blocks - v->hash_start; + hash_blocks = v->hash_end - v->hash_start; /* * Require matching block sizes for data and hash devices for @@ -658,27 +625,28 @@ int verity_fec_ctr(struct dm_verity *v) ti->error = "Block sizes must match to use FEC"; return -EINVAL; } + f->block_size = 1 << v->data_dev_block_bits; if (!f->roots) { ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; return -EINVAL; } - f->rsn = DM_VERITY_FEC_RSM - f->roots; + f->rs_k = DM_VERITY_FEC_RS_N - f->roots; if (!f->blocks) { ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS; return -EINVAL; } - f->rounds = f->blocks; - if (sector_div(f->rounds, f->rsn)) - f->rounds++; + f->region_blocks = f->blocks; + if (sector_div(f->region_blocks, f->rs_k)) + f->region_blocks++; /* * Due to optional metadata, f->blocks can be larger than * data_blocks and hash_blocks combined. */ - if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) { + if (f->blocks < v->data_blocks + hash_blocks || !f->region_blocks) { ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; return -EINVAL; } @@ -688,16 +656,14 @@ int verity_fec_ctr(struct dm_verity *v) * it to be large enough. 
*/ f->hash_blocks = f->blocks - v->data_blocks; - if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) { + if (dm_bufio_get_device_size(v->bufio) < + v->hash_start + f->hash_blocks) { ti->error = "Hash device is too small for " DM_VERITY_OPT_FEC_BLOCKS; return -E2BIG; } - f->io_size = 1 << v->data_dev_block_bits; - - f->bufio = dm_bufio_client_create(f->dev->bdev, - f->io_size, + f->bufio = dm_bufio_client_create(f->dev->bdev, f->block_size, 1, 0, NULL, NULL, 0); if (IS_ERR(f->bufio)) { ti->error = "Cannot initialize FEC bufio client"; @@ -706,14 +672,12 @@ int verity_fec_ctr(struct dm_verity *v) dm_bufio_set_sector_offset(f->bufio, f->start << (v->data_dev_block_bits - SECTOR_SHIFT)); - fec_blocks = div64_u64(f->rounds * f->roots, v->fec->roots << SECTOR_SHIFT); - if (dm_bufio_get_device_size(f->bufio) < fec_blocks) { + if (dm_bufio_get_device_size(f->bufio) < f->region_blocks * f->roots) { ti->error = "FEC device is too small"; return -E2BIG; } - f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, - 1 << v->data_dev_block_bits, + f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, f->block_size, 1, 0, NULL, NULL, 0); if (IS_ERR(f->data_bufio)) { ti->error = "Cannot initialize FEC data bufio client"; @@ -743,7 +707,7 @@ int verity_fec_ctr(struct dm_verity *v) } f->cache = kmem_cache_create("dm_verity_fec_buffers", - f->rsn << DM_VERITY_FEC_BUF_RS_BITS, + f->rs_k << DM_VERITY_FEC_BUF_RS_BITS, 0, 0, NULL); if (!f->cache) { ti->error = "Cannot create FEC buffer cache"; @@ -760,7 +724,7 @@ int verity_fec_ctr(struct dm_verity *v) /* Preallocate an output buffer for each thread */ ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(), - 1 << v->data_dev_block_bits); + f->block_size); if (ret) { ti->error = "Cannot allocate FEC output pool"; return ret; diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 35d28d9f8a9b..50b5e187d5cc 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -11,13 +11,13 
@@ #include "dm-verity.h" #include <linux/rslib.h> -/* Reed-Solomon(M, N) parameters */ -#define DM_VERITY_FEC_RSM 255 -#define DM_VERITY_FEC_MAX_RSN 253 -#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ +/* Reed-Solomon(n, k) parameters */ +#define DM_VERITY_FEC_RS_N 255 +#define DM_VERITY_FEC_MIN_ROOTS 2 /* RS(255, 253): ~0.8% space overhead */ +#define DM_VERITY_FEC_MAX_ROOTS 24 /* RS(255, 231): ~10% space overhead */ /* buffers for deinterleaving and decoding */ -#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ +#define DM_VERITY_FEC_BUF_RS_BITS 4 /* log2(RS messages per buffer) */ #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" @@ -29,13 +29,13 @@ struct dm_verity_fec { struct dm_dev *dev; /* parity data device */ struct dm_bufio_client *data_bufio; /* for data dev access */ struct dm_bufio_client *bufio; /* for parity data access */ - size_t io_size; /* IO size for roots */ + size_t block_size; /* size of data, hash, and parity blocks in bytes */ sector_t start; /* parity data start in blocks */ sector_t blocks; /* number of blocks covered */ - sector_t rounds; /* number of interleaving rounds */ + sector_t region_blocks; /* blocks per region: ceil(blocks / rs_k) */ sector_t hash_blocks; /* blocks covered after v->hash_start */ - unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ - unsigned char rsn; /* N of RS(M, N) */ + unsigned char roots; /* parity bytes per RS codeword, n-k of RS(n, k) */ + unsigned char rs_k; /* message bytes per RS codeword, k of RS(n, k) */ mempool_t fio_pool; /* mempool for dm_verity_fec_io */ mempool_t rs_pool; /* mempool for fio->rs */ mempool_t prealloc_pool; /* mempool for preallocated buffers */ @@ -47,15 +47,15 @@ struct dm_verity_fec { /* per-bio data */ struct dm_verity_fec_io { struct rs_control *rs; /* Reed-Solomon state */ - int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ + int erasures[DM_VERITY_FEC_MAX_ROOTS 
+ 1]; /* erasures for decode_rs8 */ u8 *output; /* buffer for corrected output */ unsigned int level; /* recursion level */ unsigned int nbufs; /* number of buffers allocated */ /* - * Buffers for deinterleaving RS blocks. Each buffer has space for - * the data bytes of (1 << DM_VERITY_FEC_BUF_RS_BITS) RS blocks. The - * array length is fec_max_nbufs(v), and we try to allocate that many - * buffers. However, in low-memory situations we may be unable to + * Buffers for deinterleaving RS codewords. Each buffer has space for + * the message bytes of (1 << DM_VERITY_FEC_BUF_RS_BITS) RS codewords. + * The array length is fec_max_nbufs(v), and we try to allocate that + * many buffers. However, in low-memory situations we may be unable to * allocate all buffers. 'nbufs' holds the number actually allocated. */ u8 *bufs[]; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 61073cd01d13..9a9847f94c46 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -733,8 +733,8 @@ static void verity_prefetch_io(struct work_struct *work) hash_block_start &= ~(sector_t)(cluster - 1); hash_block_end |= cluster - 1; - if (unlikely(hash_block_end >= v->hash_blocks)) - hash_block_end = v->hash_blocks - 1; + if (unlikely(hash_block_end >= v->hash_end)) + hash_block_end = v->hash_end - 1; } no_prefetch_cluster: dm_bufio_prefetch_with_ioprio(v->bufio, hash_block_start, @@ -1011,13 +1011,7 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct dm_verity *v = ti->private; - if (limits->logical_block_size < 1 << v->data_dev_block_bits) - limits->logical_block_size = 1 << v->data_dev_block_bits; - - if (limits->physical_block_size < 1 << v->data_dev_block_bits) - limits->physical_block_size = 1 << v->data_dev_block_bits; - - limits->io_min = limits->logical_block_size; + dm_stack_bs_limits(limits, 1 << v->data_dev_block_bits); /* * Similar to what dm-crypt does, opt dm-verity out of support for @@ -1607,7 
+1601,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } hash_position += s; } - v->hash_blocks = hash_position; + v->hash_end = hash_position; r = mempool_init_page_pool(&v->recheck_pool, 1, 0); if (unlikely(r)) { @@ -1634,7 +1628,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { + if (dm_bufio_get_device_size(v->bufio) < v->hash_end) { ti->error = "Hash device is too small"; r = -E2BIG; goto bad; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index d6bfabb27113..2922263501f6 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -53,9 +53,9 @@ struct dm_verity { unsigned int sig_size; /* root digest signature size */ #endif /* CONFIG_SECURITY */ unsigned int salt_size; - sector_t hash_start; /* hash start in blocks */ + sector_t hash_start; /* index of first hash block on hash_dev */ + sector_t hash_end; /* 1 + index of last hash block on hash dev */ sector_t data_blocks; /* the number of data blocks */ - sector_t hash_blocks; /* the number of hash blocks */ unsigned char data_dev_block_bits; /* log2(data blocksize) */ unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 98bd945f6da7..493f5202ad04 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1640,17 +1640,9 @@ static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limit { struct dm_writecache *wc = ti->private; - if (limits->logical_block_size < wc->block_size) - limits->logical_block_size = wc->block_size; - - if (limits->physical_block_size < wc->block_size) - limits->physical_block_size = wc->block_size; - - if (limits->io_min < wc->block_size) - limits->io_min = wc->block_size; + dm_stack_bs_limits(limits, wc->block_size); } - static void 
writecache_writeback_endio(struct bio *bio) { struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index bf398d7476b3..9e7e6b1a6f15 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -208,6 +208,20 @@ enum llbitmap_state { BitNeedSync, /* data is synchronizing */ BitSyncing, + /* + * Proactive sync requested for unwritten region (raid456 only). + * Triggered via sysfs when user wants to pre-build XOR parity + * for regions that have never been written. + */ + BitNeedSyncUnwritten, + /* Proactive sync in progress for unwritten region */ + BitSyncingUnwritten, + /* + * XOR parity has been pre-built for a region that has never had + * user data written. When user writes to this region, it transitions + * to BitDirty. + */ + BitCleanUnwritten, BitStateCount, BitNone = 0xff, }; @@ -232,6 +246,12 @@ enum llbitmap_action { * BitNeedSync. */ BitmapActionStale, + /* + * Proactive sync trigger for raid456 - builds XOR parity for + * Unwritten regions without requiring user data write first. 
+ */ + BitmapActionProactiveSync, + BitmapActionClearUnwritten, BitmapActionCount, /* Init state is BitUnwritten */ BitmapActionInit, @@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitNone, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNeedSyncUnwritten, + [BitmapActionClearUnwritten] = BitNone, }, [BitClean] = { [BitmapActionStartwrite] = BitDirty, @@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitDirty] = { [BitmapActionStartwrite] = BitNone, @@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitClean, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitNeedSync] = { [BitmapActionStartwrite] = BitNone, @@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitSyncing] = { [BitmapActionStartwrite] = BitNone, @@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, + }, + [BitNeedSyncUnwritten] = { + [BitmapActionStartwrite] = BitNeedSync, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + 
[BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitSyncingUnwritten] = { + [BitmapActionStartwrite] = BitSyncing, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitCleanUnwritten, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitCleanUnwritten] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, }, }; @@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, pctl->state[pos] = level_456 ? BitNeedSync : BitDirty; break; case BitClean: + case BitCleanUnwritten: pctl->state[pos] = BitDirty; break; } @@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, } static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, - int offset) + int offset, bool infect) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; unsigned int io_size = llbitmap->io_size; @@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, * resync all the dirty bits, hence skip infect new dirty bits to * prevent resync unnecessary data. 
*/ - if (llbitmap->mddev->degraded) { + if (llbitmap->mddev->degraded || !infect) { set_bit(block, pctl->dirty); return; } @@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, llbitmap->pctl[idx]->state[bit] = state; if (state == BitDirty || state == BitNeedSync) - llbitmap_set_page_dirty(llbitmap, idx, bit); + llbitmap_set_page_dirty(llbitmap, idx, bit, true); + else if (state == BitNeedSyncUnwritten) + llbitmap_set_page_dirty(llbitmap, idx, bit, false); } static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) @@ -459,7 +528,8 @@ static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) rdev_for_each(rdev, mddev) { sector_t sector; - if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) continue; sector = mddev->bitmap_info.offset + @@ -584,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap) return 0; } +/* + * Check if all underlying disks support write_zeroes with unmap. + */ +static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0) + return false; + } + + return true; +} + +/* + * Issue write_zeroes to all underlying disks to zero their data regions. + * This ensures parity consistency for RAID-456 (0 XOR 0 = 0). + * Returns true if all disks were successfully zeroed. 
+ */ +static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + sector_t dev_sectors = mddev->dev_sectors; + int ret; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + ret = blkdev_issue_zeroout(rdev->bdev, + rdev->data_offset, + dev_sectors, + GFP_KERNEL, 0); + if (ret) { + pr_warn("md/llbitmap: failed to zero disk %pg: %d\n", + rdev->bdev, ret); + return false; + } + } + + return true; +} + static void llbitmap_init_state(struct llbitmap *llbitmap) { + struct mddev *mddev = llbitmap->mddev; enum llbitmap_state state = BitUnwritten; unsigned long i; - if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) { state = BitClean; + } else if (raid_is_456(mddev) && + llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) { + /* + * All disks support write_zeroes with unmap. Zero all disks + * to ensure parity consistency, then set BitCleanUnwritten + * to skip initial sync. + */ + if (llbitmap_zero_all_disks(llbitmap)) + state = BitCleanUnwritten; + } for (i = 0; i < llbitmap->chunks; i++) llbitmap_write(llbitmap, state, i); @@ -626,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, goto write_bitmap; } - if (c == BitNeedSync) + if (c == BitNeedSync || c == BitNeedSyncUnwritten) need_resync = !mddev->degraded; state = state_machine[c][action]; - write_bitmap: if (unlikely(mddev->degraded)) { /* For degraded array, mark new data as need sync. 
*/ @@ -657,8 +786,7 @@ write_bitmap: } llbitmap_write(llbitmap, state, start); - - if (state == BitNeedSync) + if (state == BitNeedSync || state == BitNeedSyncUnwritten) need_resync = !mddev->degraded; else if (state == BitDirty && !timer_pending(&llbitmap->pending_timer)) @@ -1069,12 +1197,12 @@ static void llbitmap_start_write(struct mddev *mddev, sector_t offset, int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; - llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); - while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); } static void llbitmap_end_write(struct mddev *mddev, sector_t offset, @@ -1101,12 +1229,12 @@ static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; - llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); - while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); } static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, @@ -1228,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) unsigned long p = offset >> llbitmap->chunkshift; enum llbitmap_state c = llbitmap_read(llbitmap, p); - return c == BitClean || c == BitDirty; + return c == BitClean || c == BitDirty || c == BitCleanUnwritten; } static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) @@ -1242,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) if (c == BitUnwritten) return blocks; + /* Skip CleanUnwritten - no user data, will be reset after recovery */ + if (c == BitCleanUnwritten) + return blocks; + /* For degraded array, 
don't skip */ if (mddev->degraded) return 0; @@ -1260,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, { struct llbitmap *llbitmap = mddev->bitmap; unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state state; + + /* + * Before recovery starts, convert CleanUnwritten to Unwritten. + * This ensures the new disk won't have stale parity data. + */ + if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionClearUnwritten); + /* * Handle one bit at a time, this is much simpler. And it doesn't matter * if md_do_sync() loop more times. */ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); - return llbitmap_state_machine(llbitmap, p, p, - BitmapActionStartsync) == BitSyncing; + state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync); + return state == BitSyncing || state == BitSyncingUnwritten; } /* Something is wrong, sync_thread stop at @offset */ @@ -1473,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page) } mutex_unlock(&mddev->bitmap_info.mutex); - return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + return sprintf(page, + "unwritten %d\nclean %d\ndirty %d\n" + "need sync %d\nsyncing %d\n" + "need sync unwritten %d\nsyncing unwritten %d\n" + "clean unwritten %d\n", bits[BitUnwritten], bits[BitClean], bits[BitDirty], - bits[BitNeedSync], bits[BitSyncing]); + bits[BitNeedSync], bits[BitSyncing], + bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten], + bits[BitCleanUnwritten]); } static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); @@ -1548,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); +static ssize_t +proactive_sync_store(struct mddev *mddev, const char *buf, size_t 
len) +{ + struct llbitmap *llbitmap; + + /* Only for RAID-456 */ + if (!raid_is_456(mddev)) + return -EINVAL; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || !llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return -ENODEV; + } + + /* Trigger proactive sync on all Unwritten regions */ + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionProactiveSync); + + mutex_unlock(&mddev->bitmap_info.mutex); + return len; +} + +static struct md_sysfs_entry llbitmap_proactive_sync = + __ATTR(proactive_sync, 0200, NULL, proactive_sync_store); + static struct attribute *md_llbitmap_attrs[] = { &llbitmap_bits.attr, &llbitmap_metadata.attr, &llbitmap_daemon_sleep.attr, &llbitmap_barrier_idle.attr, + &llbitmap_proactive_sync.attr, NULL }; diff --git a/drivers/md/md.c b/drivers/md/md.c index 3ce6f9e9d38e..5fb5ae8368ba 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); -static struct workqueue_struct *md_wq; /* * This workqueue is used for sync_work to register new sync_thread, and for @@ -98,7 +97,7 @@ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); -static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void export_rdev(struct md_rdev *rdev); static void md_wakeup_thread_directly(struct md_thread __rcu **thread); /* @@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev) spin_lock_init(&serial_tmp->serial_lock); serial_tmp->serial_rb = RB_ROOT_CACHED; - init_waitqueue_head(&serial_tmp->serial_io_wait); } rdev->serial = serial; @@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible) } percpu_ref_kill(&mddev->active_io); + + /* + * RAID456 IO can sleep in wait_for_reshape while still holding an + 
* active_io reference. If reshape is already interrupted or frozen, + * wake those waiters so they can abort and drop the reference instead + * of deadlocking suspend. + */ + if (mddev->pers && mddev->pers->prepare_suspend && + reshape_interrupted(mddev)) + mddev->pers->prepare_suspend(mddev); + if (interruptible) err = wait_event_interruptible(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); @@ -959,7 +968,7 @@ void mddev_unlock(struct mddev *mddev) list_for_each_entry_safe(rdev, tmp, &delete, same_set) { list_del_init(&rdev->same_set); kobject_del(&rdev->kobj); - export_rdev(rdev, mddev); + export_rdev(rdev); } if (!legacy_async_del_gendisk) { @@ -2632,7 +2641,7 @@ void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ static struct md_rdev claim_rdev; -static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) +static void export_rdev(struct md_rdev *rdev) { pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); @@ -2788,7 +2797,9 @@ void md_update_sb(struct mddev *mddev, int force_change) if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev)); + if (!mddev_is_dm(mddev)) + pr_err_ratelimited("%s: can't update sb for read-only array %s\n", + __func__, mdname(mddev)); return; } @@ -4848,7 +4859,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) err = bind_rdev_to_array(rdev, mddev); out: if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); mddev_unlock_and_resume(mddev); if (!err) md_new_event(); @@ -6128,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); - mddev_put(mddev); + /* + * For "array_state=clear", dropping the extra kobject reference from + * sysfs_break_active_protection() can trigger md kobject deletion. 
+ * Restore active protection before mddev_put() so deletion happens + * after the sysfs write path fully unwinds. + */ if (kn) sysfs_unbreak_active_protection(kn); + mddev_put(mddev); return rv; } @@ -6447,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +/* + * Read bitmap superblock and return the bitmap_id based on disk version. + * This is used as fallback when default bitmap version and on-disk version + * doesn't match, and mdadm is not the latest version to set bitmap_type. + */ +static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct page *sb_page; + bitmap_super_t *sb; + enum md_submodule_id id = ID_BITMAP_NONE; + sector_t sector; + u32 version; + + if (!mddev->bitmap_info.offset) + return ID_BITMAP_NONE; + + sb_page = alloc_page(GFP_KERNEL); + if (!sb_page) { + pr_warn("md: %s: failed to allocate memory for bitmap\n", + mdname(mddev)); + return ID_BITMAP_NONE; + } + + sector = mddev->bitmap_info.offset; + + rdev_for_each(rdev, mddev) { + u32 iosize; + + if (!test_bit(In_sync, &rdev->flags) || + test_bit(Faulty, &rdev->flags) || + test_bit(Bitmap_sync, &rdev->flags)) + continue; + + iosize = roundup(sizeof(bitmap_super_t), + bdev_logical_block_size(rdev->bdev)); + if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ, + true)) + goto read_ok; + } + pr_warn("md: %s: failed to read bitmap from any device\n", + mdname(mddev)); + goto out; + +read_ok: + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_warn("md: %s: invalid bitmap magic 0x%x\n", + mdname(mddev), le32_to_cpu(sb->magic)); + goto out_unmap; + } + + version = le32_to_cpu(sb->version); + switch (version) { + case BITMAP_MAJOR_LO: + case BITMAP_MAJOR_HI: + case BITMAP_MAJOR_CLUSTERED: + id = ID_BITMAP; + break; + case BITMAP_MAJOR_LOCKLESS: + id = ID_LLBITMAP; + break; + default: + pr_warn("md: %s: unknown bitmap version %u\n", + mdname(mddev), 
version); + break; + } + +out_unmap: + kunmap_local(sb); +out: + __free_page(sb_page); + return id; +} + static int md_bitmap_create(struct mddev *mddev) { + enum md_submodule_id orig_id = mddev->bitmap_id; + enum md_submodule_id sb_id; + int err; + if (mddev->bitmap_id == ID_BITMAP_NONE) return -EINVAL; if (!mddev_set_bitmap_ops(mddev)) return -ENOENT; - return mddev->bitmap_ops->create(mddev); + err = mddev->bitmap_ops->create(mddev); + if (!err) + return 0; + + /* + * Create failed, if default bitmap version and on-disk version + * doesn't match, and mdadm is not the latest version to set + * bitmap_type, set bitmap_ops based on the disk version. + */ + mddev_clear_bitmap_ops(mddev); + + sb_id = md_bitmap_get_id_from_sb(mddev); + if (sb_id == ID_BITMAP_NONE || sb_id == orig_id) + return err; + + pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n", + mdname(mddev), orig_id, sb_id); + + mddev->bitmap_id = sb_id; + if (!mddev_set_bitmap_ops(mddev)) { + mddev->bitmap_id = orig_id; + return -ENOENT; + } + + err = mddev->bitmap_ops->create(mddev); + if (err) { + mddev_clear_bitmap_ops(mddev); + mddev->bitmap_id = orig_id; + } + + return err; } static void md_bitmap_destroy(struct mddev *mddev) @@ -7140,7 +7266,7 @@ static void autorun_devices(int part) rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) - export_rdev(rdev, mddev); + export_rdev(rdev); } autorun_array(mddev); mddev_unlock_and_resume(mddev); @@ -7150,7 +7276,7 @@ static void autorun_devices(int part) */ rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); - export_rdev(rdev, mddev); + export_rdev(rdev); } mddev_put(mddev); } @@ -7338,13 +7464,13 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, rdev0->bdev); - export_rdev(rdev, mddev); + export_rdev(rdev); return -EINVAL; } } err = bind_rdev_to_array(rdev, mddev); if 
(err) - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } @@ -7387,7 +7513,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* This was a hot-add request, but events doesn't * match, so reject it. */ - export_rdev(rdev, mddev); + export_rdev(rdev); return -EINVAL; } @@ -7413,7 +7539,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) } } if (has_journal || mddev->bitmap) { - export_rdev(rdev, mddev); + export_rdev(rdev); return -EBUSY; } set_bit(Journal, &rdev->flags); @@ -7428,7 +7554,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* --add initiated by this node */ err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } } @@ -7438,7 +7564,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { @@ -7501,7 +7627,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) { - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } } @@ -7613,7 +7739,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) return 0; abort_export: - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } @@ -10503,10 +10629,6 @@ static int __init md_init(void) goto err_bitmap; ret = -ENOMEM; - md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0); - if (!md_wq) - goto err_wq; - md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0); if (!md_misc_wq) goto err_misc_wq; @@ -10531,8 +10653,6 @@ err_mdp: err_md: destroy_workqueue(md_misc_wq); err_misc_wq: - destroy_workqueue(md_wq); -err_wq: md_llbitmap_exit(); err_bitmap: md_bitmap_exit(); @@ -10841,7 +10961,6 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); 
- destroy_workqueue(md_wq); md_bitmap_exit(); } diff --git a/drivers/md/md.h b/drivers/md/md.h index ac84289664cd..d6f5482e2479 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -126,7 +126,6 @@ enum sync_action { struct serial_in_rdev { struct rb_root_cached serial_rb; spinlock_t serial_lock; - wait_queue_head_t serial_io_wait; }; /* @@ -381,7 +380,11 @@ struct serial_info { struct rb_node node; sector_t start; /* start sector of rb node */ sector_t last; /* end sector of rb node */ + sector_t wnode_start; /* address of waiting nodes on the same list */ sector_t _subtree_last; /* highest sector in subtree of rb node */ + struct list_head list_node; + struct list_head waiters; + struct completion ready; }; /* diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ef0045db409f..5e38a51e349a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } err = -ENOMEM; - conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones); + conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones); if (!conf->strip_zone) goto abort; - conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *), - conf->nr_strip_zones, - mddev->raid_disks), - GFP_KERNEL); + conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *), + conf->nr_strip_zones, + mddev->raid_disks), + GFP_KERNEL); if (!conf->devlist) goto abort; @@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) return 0; abort: - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); *private_conf = ERR_PTR(err); return err; @@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); } diff --git a/drivers/md/raid1.c 
b/drivers/md/raid1.c index 181400e147c0..ba91f7e61920 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, START, LAST, static inline, raid1_rb); static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, - struct serial_info *si, int idx) + struct serial_info *si) { unsigned long flags; int ret = 0; sector_t lo = r1_bio->sector; - sector_t hi = lo + r1_bio->sectors; + sector_t hi = lo + r1_bio->sectors - 1; + int idx = sector_to_idx(r1_bio->sector); struct serial_in_rdev *serial = &rdev->serial[idx]; + struct serial_info *head_si; spin_lock_irqsave(&serial->serial_lock, flags); /* collision happened */ - if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) + head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + if (head_si && head_si != si) { + si->start = lo; + si->last = hi; + si->wnode_start = head_si->wnode_start; + list_add_tail(&si->list_node, &head_si->waiters); ret = -EBUSY; - else { + } else if (!head_si) { si->start = lo; si->last = hi; + si->wnode_start = si->start; raid1_rb_insert(si, &serial->serial_rb); } spin_unlock_irqrestore(&serial->serial_lock, flags); @@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) { struct mddev *mddev = rdev->mddev; struct serial_info *si; - int idx = sector_to_idx(r1_bio->sector); - struct serial_in_rdev *serial = &rdev->serial[idx]; if (WARN_ON(!mddev->serial_info_pool)) return; si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - wait_event(serial->serial_io_wait, - check_and_add_serial(rdev, r1_bio, si, idx) == 0); + INIT_LIST_HEAD(&si->waiters); + INIT_LIST_HEAD(&si->list_node); + init_completion(&si->ready); + while (check_and_add_serial(rdev, r1_bio, si)) { + wait_for_completion(&si->ready); + reinit_completion(&si->ready); + } } static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *si; + struct 
serial_info *si, *iter_si; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; @@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); si; si = raid1_rb_iter_next(si, lo, hi)) { if (si->start == lo && si->last == hi) { - raid1_rb_remove(si, &serial->serial_rb); - mempool_free(si, mddev->serial_info_pool); found = 1; break; } } - if (!found) + if (found) { + raid1_rb_remove(si, &serial->serial_rb); + if (!list_empty(&si->waiters)) { + list_for_each_entry(iter_si, &si->waiters, list_node) { + if (iter_si->wnode_start == si->wnode_start) { + list_del_init(&iter_si->list_node); + list_splice_init(&si->waiters, &iter_si->waiters); + raid1_rb_insert(iter_si, &serial->serial_rb); + complete(&iter_si->ready); + break; + } + } + } + mempool_free(si, mddev->serial_info_pool); + } else { WARN(1, "The write IO is not recorded for serialization\n"); + } spin_unlock_irqrestore(&serial->serial_lock, flags); - wake_up(&serial->serial_io_wait); } /* @@ -452,7 +475,7 @@ static void raid1_end_write_request(struct bio *bio) int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; + sector_t hi = r1_bio->sector + r1_bio->sectors - 1; bool ignore_error = !raid1_should_handle_error(bio) || (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); @@ -1878,7 +1901,7 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, if (info->rdev) return false; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { set_bit(Nonrot, &rdev->flags); WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0653b5d8545a..4901ebe45c87 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -806,7 +806,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) 
break; - nonrot = bdev_nonrot(rdev->bdev); + nonrot = !bdev_rot(rdev->bdev); has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); if (min_pending > pending && nonrot) { @@ -1184,7 +1184,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) { - raid_end_bio_io(r10_bio); + free_r10bio(r10_bio); return; } @@ -1372,7 +1372,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sectors = r10_bio->sectors; if (!regular_request_wait(mddev, conf, bio, sectors)) { - raid_end_bio_io(r10_bio); + free_r10bio(r10_bio); return; } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 66b10cbda96d..7b7546bfa21f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, return -ENOMEM; while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; + payload = (void *)mb + mb_offset; payload_flush = (void *)mb + mb_offset; if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) goto mismatch; } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) @@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, payload->checksum[1]) < 0) goto mismatch; } else if 
(le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - /* nothing to do for R5LOG_PAYLOAD_FLUSH here */ + payload_len = sizeof(struct r5l_payload_flush) + + (sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ goto mismatch; - if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); - } else { - /* DATA or PARITY payload */ + if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) { log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); } - + mb_offset += payload_len; } put_page(page); @@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; int dd; payload = (void *)mb + mb_offset; @@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { int i, count; + payload_len = sizeof(struct r5l_payload_flush) + + (sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > + le32_to_cpu(mb->meta_size)) + return -EINVAL; + count = le32_to_cpu(payload_flush->size) / sizeof(__le64); for (i = 0; i < count; ++i) { stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]); @@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, } } - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); + mb_offset += payload_len; continue; } /* DATA or PARITY payload */ + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > 
le32_to_cpu(mb->meta_size)) + return -EINVAL; + stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ? raid5_compute_sector( conf, le64_to_cpu(payload->location), 0, &dd, @@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + mb_offset += payload_len; } return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8e8d431071b..6e79829c5acb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3916,6 +3916,8 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, break; } BUG_ON(other < 0); + if (test_bit(R5_LOCKED, &sh->dev[other].flags)) + return 0; pr_debug("Computing stripe %llu blocks %d,%d\n", (unsigned long long)sh->sector, disk_idx, other); @@ -4594,20 +4596,6 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) async_tx_quiesce(&tx); } -/* - * handle_stripe - do things to a stripe. - * - * We lock the stripe by setting STRIPE_ACTIVE and then examine the - * state of various bits to see what needs to be done. - * Possible results: - * return some read requests which now have data - * return some write requests which are safely on storage - * schedule a read on some buffers - * schedule a write of some buffers - * return confirmation of parity correctness - * - */ - static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) { struct r5conf *conf = sh->raid_conf; @@ -4901,6 +4889,18 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_bit(STRIPE_HANDLE, &head_sh->state); } +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe by setting STRIPE_ACTIVE and then examine the + * state of various bits to see what needs to be done. 
+ * Possible results: + * return some read requests which now have data + * return some write requests which are safely on storage + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + */ static void handle_stripe(struct stripe_head *sh) { struct stripe_head_state s; @@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { - raid5_release_stripe(sh); + int hash; + + spin_lock_irq(&conf->device_lock); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, + &conf->temp_inactive_list[hash]); + spin_unlock_irq(&conf->device_lock); conf->retry_read_aligned = raid_bio; conf->retry_read_offset = scnt; return handled; @@ -7541,7 +7547,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) continue; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { conf->batch_bio_dispatch = false; break; } @@ -7780,6 +7786,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); + lim.chunk_sectors = lim.io_opt >> 9; lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 110b1c2d0a86..1c7b710fc9c1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -801,7 +801,6 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx) } #endif -void md_raid5_kick_device(struct r5conf *conf); int raid5_set_cache_size(struct mddev *mddev, int size); sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); void raid5_release_stripe(struct stripe_head *sh); |
