author     Linus Torvalds <torvalds@linux-foundation.org>  2019-05-07 18:14:36 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-05-07 18:14:36 -0700
commit     67a242223958d628f0ba33283668e3ddd192d057 (patch)
tree       a39e7039e9a2ef9ab46f8ba561175dbdc6101d11 /drivers/md
parent     8b35ad6232c462b02e397e87ce702bcddd4ba543 (diff)
parent     b8753433fc611e23e31300e1d099001a08955c88 (diff)
Merge tag 'for-5.2/block-20190507' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"Nothing major in this series, just fixes and improvements all over the
map. This contains:
- Series of fixes for sed-opal (David, Jonas)
- Fixes and performance tweaks for BFQ (via Paolo)
- Set of fixes for bcache (via Coly)
- Set of fixes for md (via Song)
- Enabling multi-page for passthrough requests (Ming)
- Queue release fix series (Ming)
- Device notification improvements (Martin)
- Propagate underlying device rotational status in loop (Holger)
- Removal of mtip32xx trim support, which has been disabled for years
(Christoph)
- Improvement and cleanup of nvme command handling (Christoph)
- Add block SPDX tags (Christoph)
 - Cleanup/hardening of bio/bvec iteration (Christoph); the new iterator
 calling convention is sketched just after this message
- A few NVMe pull requests (Christoph)
- Removal of CONFIG_LBDAF (Christoph)
- Various little fixes here and there"
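The drivers/md diff below picks up the new bio_for_each_segment_all() calling
convention from that bvec iteration cleanup: the extra integer index argument
is gone, iteration state lives in an on-stack struct bvec_iter_all, and callers
that still need an index (raid1's process_checks(), for instance) keep their
own counter. A minimal sketch of the new convention follows; the helper name
and its body are invented for illustration, only the iterator usage mirrors
the hunks below:

#include <linux/bio.h>
#include <linux/printk.h>

/* Hypothetical helper: walk every segment of a bio with the 5.2+ iterator. */
static unsigned int example_count_bio_bytes(struct bio *bio)
{
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;
	unsigned int idx = 0, bytes = 0;

	bio_for_each_segment_all(bv, bio, iter_all) {
		/* callers that still want an index keep it themselves (cf. raid1) */
		pr_debug("segment %u: %u bytes\n", idx++, bv->bv_len);
		bytes += bv->bv_len;
	}

	return bytes;
}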
* tag 'for-5.2/block-20190507' of git://git.kernel.dk/linux-block: (164 commits)
block: fix mismerge in bvec_advance
block: don't drain in-progress dispatch in blk_cleanup_queue()
blk-mq: move cancel of hctx->run_work into blk_mq_hw_sysfs_release
blk-mq: always free hctx after request queue is freed
blk-mq: split blk_mq_alloc_and_init_hctx into two parts
blk-mq: free hw queue's resource in hctx's release handler
blk-mq: move cancel of requeue_work into blk_mq_release
blk-mq: grab .q_usage_counter when queuing request from plug code path
block: fix function name in comment
nvmet: protect discovery change log event list iteration
nvme: mark nvme_core_init and nvme_core_exit static
nvme: move command size checks to the core
nvme-fabrics: check more command sizes
nvme-pci: check more command sizes
nvme-pci: remove an unneeded variable initialization
nvme-pci: unquiesce admin queue on shutdown
nvme-pci: shutdown on timeout during deletion
nvme-pci: fix psdt field for single segment sgls
nvme-multipath: don't print ANA group state by default
nvme-multipath: split bios with the ns_head bio_set before submitting
...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/alloc.c         |   5
-rw-r--r--  drivers/md/bcache/btree.c         |  12
-rw-r--r--  drivers/md/bcache/journal.c       |  42
-rw-r--r--  drivers/md/bcache/request.c       |  41
-rw-r--r--  drivers/md/bcache/request.h       |   2
-rw-r--r--  drivers/md/bcache/super.c         |  84
-rw-r--r--  drivers/md/bcache/sysfs.c         |   2
-rw-r--r--  drivers/md/bcache/util.h          |  26
-rw-r--r--  drivers/md/dm-crypt.c             |   3
-rw-r--r--  drivers/md/dm-exception-store.h   |  28
-rw-r--r--  drivers/md/dm-integrity.c         |   8
-rw-r--r--  drivers/md/md-bitmap.c            |   8
-rw-r--r--  drivers/md/md.c                   | 199
-rw-r--r--  drivers/md/md.h                   |  25
-rw-r--r--  drivers/md/raid1.c                |   6
-rw-r--r--  drivers/md/raid5.c                |  16
16 files changed, 275 insertions, 232 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 5002838ea476..f8986effcb50 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -327,10 +327,11 @@ static int bch_allocator_thread(void *arg) * possibly issue discards to them, then we add the bucket to * the free list: */ - while (!fifo_empty(&ca->free_inc)) { + while (1) { long bucket; - fifo_pop(&ca->free_inc, bucket); + if (!fifo_pop(&ca->free_inc, bucket)) + break; if (ca->discard) { mutex_unlock(&ca->set->bucket_lock); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 64def336f053..773f5fdad25f 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -429,14 +429,14 @@ static void do_btree_node_write(struct btree *b) bset_sector_offset(&b->keys, i)); if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { - int j; struct bio_vec *bv; - void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, b->bio, j, iter_all) - memcpy(page_address(bv->bv_page), - base + j * PAGE_SIZE, PAGE_SIZE); + bio_for_each_segment_all(bv, b->bio, iter_all) { + memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); + addr += PAGE_SIZE; + } bch_submit_bbio(b->bio, b->c, &k.key, 0); @@ -1476,11 +1476,11 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, out_nocoalesce: closure_sync(&cl); - bch_keylist_free(&keylist); while ((k = bch_keylist_pop(&keylist))) if (!bkey_cmp(k, &ZERO_KEY)) atomic_dec(&b->c->prio_blocked); + bch_keylist_free(&keylist); for (i = 0; i < nodes; i++) if (!IS_ERR_OR_NULL(new_nodes[i])) { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index b2fd412715b1..12dae9348147 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -147,7 +147,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) { #define read_bucket(b) \ ({ \ - int ret = journal_read_bucket(ca, list, b); \ + ret = journal_read_bucket(ca, list, b); \ __set_bit(b, bitmap); \ if (ret < 0) \ return ret; \ @@ -156,6 +156,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) struct cache *ca; unsigned int iter; + int ret = 0; for_each_cache(ca, c, iter) { struct journal_device *ja = &ca->journal; @@ -267,7 +268,7 @@ bsearch: struct journal_replay, list)->j.seq; - return 0; + return ret; #undef read_bucket } @@ -317,6 +318,18 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } } +static bool is_discard_enabled(struct cache_set *s) +{ + struct cache *ca; + unsigned int i; + + for_each_cache(ca, s, i) + if (ca->discard) + return true; + + return false; +} + int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; @@ -330,9 +343,17 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); - cache_set_err_on(n != i->j.seq, s, -"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", - n, i->j.seq - 1, start, end); + if (n != i->j.seq) { + if (n == start && is_discard_enabled(s)) + pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)", + n, i->j.seq - 1, start, end); + else { + pr_err("bcache: journal entries %llu-%llu missing! 
(replaying %llu-%llu)", + n, i->j.seq - 1, start, end); + ret = -EIO; + goto err; + } + } for (k = i->j.start; k < bset_bkey_last(&i->j); @@ -540,11 +561,11 @@ static void journal_reclaim(struct cache_set *c) ca->sb.nr_this_dev); } - bkey_init(k); - SET_KEY_PTRS(k, n); - - if (n) + if (n) { + bkey_init(k); + SET_KEY_PTRS(k, n); c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; + } out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); @@ -671,6 +692,9 @@ static void journal_write_unlocked(struct closure *cl) ca->journal.seq[ca->journal.cur_idx] = w->data->seq; } + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + atomic_dec_bug(&fifo_back(&c->journal.pin)); bch_journal_next(&c->journal); journal_reclaim(c); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f101bfe8657a..41adcd1546f1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -329,12 +329,13 @@ void bch_data_insert(struct closure *cl) bch_data_insert_start(cl); } -/* Congested? */ - -unsigned int bch_get_congested(struct cache_set *c) +/* + * Congested? Return 0 (not congested) or the limit (in sectors) + * beyond which we should bypass the cache due to congestion. + */ +unsigned int bch_get_congested(const struct cache_set *c) { int i; - long rand; if (!c->congested_read_threshold_us && !c->congested_write_threshold_us) @@ -353,8 +354,7 @@ unsigned int bch_get_congested(struct cache_set *c) if (i > 0) i = fract_exp_two(i, 6); - rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); + i -= hweight32(get_random_u32()); return i > 0 ? i : 1; } @@ -376,7 +376,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; unsigned int mode = cache_mode(dc); - unsigned int sectors, congested = bch_get_congested(c); + unsigned int sectors, congested; struct task_struct *task = current; struct io *i; @@ -412,6 +412,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto rescale; } + congested = bch_get_congested(c); if (!congested && !dc->sequential_cutoff) goto rescale; @@ -706,14 +707,14 @@ static void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - atomic_dec(&s->d->c->search_inflight); + atomic_dec(&s->iop.c->search_inflight); if (s->iop.bio) bio_put(s->iop.bio); bio_complete(s); closure_debug_destroy(cl); - mempool_free(s, &s->d->c->search); + mempool_free(s, &s->iop.c->search); } static inline struct search *search_alloc(struct bio *bio, @@ -756,13 +757,13 @@ static void cached_dev_bio_complete(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - search_free(cl); cached_dev_put(dc); + search_free(cl); } /* Process reads */ -static void cached_dev_cache_miss_done(struct closure *cl) +static void cached_dev_read_error_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); @@ -800,7 +801,22 @@ static void cached_dev_read_error(struct closure *cl) closure_bio_submit(s->iop.c, bio, cl); } - continue_at(cl, cached_dev_cache_miss_done, NULL); + continue_at(cl, cached_dev_read_error_done, NULL); +} + +static void cached_dev_cache_miss_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bcache_device *d = s->d; + + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); + + if (s->iop.bio) + bio_free_pages(s->iop.bio); + 
+ cached_dev_bio_complete(cl); + closure_put(&d->cl); } static void cached_dev_read_done(struct closure *cl) @@ -833,6 +849,7 @@ static void cached_dev_read_done(struct closure *cl) if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); + closure_get(&dc->disk.cl); bio_complete(s); if (s->iop.bio && diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 721bf336ed1a..c64dbd7a91aa 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -33,7 +33,7 @@ struct data_insert_op { BKEY_PADDED(replace_key); }; -unsigned int bch_get_congested(struct cache_set *c); +unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a697a3a923cd..1b63ac876169 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -662,6 +662,11 @@ static const struct block_device_operations bcache_ops = { void bcache_device_stop(struct bcache_device *d) { if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) + /* + * closure_fn set to + * - cached device: cached_dev_flush() + * - flash dev: flash_dev_flush() + */ closure_queue(&d->cl); } @@ -906,21 +911,18 @@ static int cached_dev_status_update(void *arg) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; - char buf[SB_LABEL_SIZE + 1]; + char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); char *env[] = { "DRIVER=bcache", kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), - NULL, + kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""), NULL, }; - memcpy(buf, dc->sb.label, SB_LABEL_SIZE); - buf[SB_LABEL_SIZE] = '\0'; - env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); + kfree(buf); return; } @@ -944,6 +946,7 @@ void bch_cached_dev_run(struct cached_dev *dc) kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); kfree(env[1]); kfree(env[2]); + kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) @@ -1174,6 +1177,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, return 0; } +/* when dc->disk.kobj released */ void bch_cached_dev_release(struct kobject *kobj) { struct cached_dev *dc = container_of(kobj, struct cached_dev, @@ -1280,7 +1284,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ -static void register_bdev(struct cache_sb *sb, struct page *sb_page, +static int register_bdev(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cached_dev *dc) { @@ -1318,14 +1322,16 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) bch_cached_dev_run(dc); - return; + return 0; err: pr_notice("error %s: %s", dc->backing_dev_name, err); bcache_device_stop(&dc->disk); + return -EIO; } /* Flash only volumes */ +/* When d->kobj released */ void bch_flash_dev_release(struct kobject *kobj) { struct bcache_device *d = container_of(kobj, struct bcache_device, @@ -1496,6 +1502,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) 
return true; } +/* When c->kobj released */ void bch_cache_set_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); @@ -1516,6 +1523,7 @@ static void cache_set_free(struct closure *cl) bch_btree_cache_free(c); bch_journal_free(c); + mutex_lock(&bch_register_lock); for_each_cache(ca, c, i) if (ca) { ca->set = NULL; @@ -1534,7 +1542,6 @@ static void cache_set_free(struct closure *cl) mempool_exit(&c->search); kfree(c->devices); - mutex_lock(&bch_register_lock); list_del(&c->list); mutex_unlock(&bch_register_lock); @@ -1673,6 +1680,7 @@ static void __cache_set_unregister(struct closure *cl) void bch_cache_set_stop(struct cache_set *c) { if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) + /* closure_fn set to __cache_set_unregister() */ closure_queue(&c->caching); } @@ -1775,13 +1783,15 @@ err: return NULL; } -static void run_cache_set(struct cache_set *c) +static int run_cache_set(struct cache_set *c) { const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; struct cache *ca; struct closure cl; unsigned int i; + LIST_HEAD(journal); + struct journal_replay *l; closure_init_stack(&cl); @@ -1790,7 +1800,6 @@ static void run_cache_set(struct cache_set *c) set_gc_sectors(c); if (CACHE_SYNC(&c->sb)) { - LIST_HEAD(journal); struct bkey *k; struct jset *j; @@ -1869,7 +1878,9 @@ static void run_cache_set(struct cache_set *c) if (j->version < BCACHE_JSET_VERSION_UUID) __uuid_write(c); - bch_journal_replay(c, &journal); + err = "bcache: replay journal failed"; + if (bch_journal_replay(c, &journal)) + goto err; } else { pr_notice("invalidating existing data"); @@ -1937,11 +1948,19 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); set_bit(CACHE_SET_RUNNING, &c->flags); - return; + return 0; err: + while (!list_empty(&journal)) { + l = list_first_entry(&journal, struct journal_replay, list); + list_del(&l->list); + kfree(l); + } + closure_sync(&cl); /* XXX: test this, it's broken */ bch_cache_set_error(c, "%s", err); + + return -EIO; } static bool can_attach_cache(struct cache *ca, struct cache_set *c) @@ -2005,8 +2024,11 @@ found: ca->set->cache[ca->sb.nr_this_dev] = ca; c->cache_by_alloc[c->caches_loaded++] = ca; - if (c->caches_loaded == c->sb.nr_in_set) - run_cache_set(c); + if (c->caches_loaded == c->sb.nr_in_set) { + err = "failed to run cache set"; + if (run_cache_set(c) < 0) + goto err; + } return NULL; err: @@ -2016,6 +2038,7 @@ err: /* Cache device */ +/* When ca->kobj released */ void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); @@ -2179,6 +2202,12 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, ret = cache_alloc(ca); if (ret != 0) { + /* + * If we failed here, it means ca->kobj is not initialized yet, + * kobject_put() won't be called and there is no chance to + * call blkdev_put() to bdev in bch_cache_release(). So we + * explicitly call blkdev_put() here. 
+ */ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; @@ -2262,7 +2291,7 @@ static bool bch_is_open(struct block_device *bdev) static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { - ssize_t ret = size; + ssize_t ret = -EINVAL; const char *err = "cannot allocate memory"; char *path = NULL; struct cache_sb *sb = NULL; @@ -2296,7 +2325,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!IS_ERR(bdev)) bdput(bdev); if (attr == &ksysfs_register_quiet) - goto out; + goto quiet_out; } goto err; } @@ -2317,17 +2346,23 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto err_close; mutex_lock(&bch_register_lock); - register_bdev(sb, sb_page, bdev, dc); + ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); + /* blkdev_put() will be called in cached_dev_free() */ + if (ret < 0) + goto err; } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) goto err_close; + /* blkdev_put() will be called in bch_cache_release() */ if (register_cache(sb, sb_page, bdev, ca) != 0) goto err; } +quiet_out: + ret = size; out: if (sb_page) put_page(sb_page); @@ -2340,7 +2375,6 @@ err_close: blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); err: pr_info("error %s: %s", path, err); - ret = -EINVAL; goto out; } @@ -2370,10 +2404,19 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) list_for_each_entry_safe(dc, tdc, &uncached_devices, list) bcache_device_stop(&dc->disk); + mutex_unlock(&bch_register_lock); + + /* + * Give an early chance for other kthreads and + * kworkers to stop themselves + */ + schedule(); + /* What's a condition variable? */ while (1) { - long timeout = start + 2 * HZ - jiffies; + long timeout = start + 10 * HZ - jiffies; + mutex_lock(&bch_register_lock); stopped = list_empty(&bch_cache_sets) && list_empty(&uncached_devices); @@ -2385,7 +2428,6 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) mutex_unlock(&bch_register_lock); schedule_timeout(timeout); - mutex_lock(&bch_register_lock); } finish_wait(&unregister_wait, &wait); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 17bae9c14ca0..6cd44d3cf906 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -996,8 +996,6 @@ SHOW(__bch_cache) !cached[n - 1]) --n; - unused = ca->sb.nbuckets - n; - while (cached < p + n && *cached == BTREE_PRIO) cached++, n--; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 00aab6abcfe4..1fbced94e4cc 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -560,17 +560,29 @@ static inline uint64_t bch_crc64_update(uint64_t crc, return crc; } -/* Does linear interpolation between powers of two */ +/* + * A stepwise-linear pseudo-exponential. This returns 1 << (x >> + * frac_bits), with the less-significant bits filled in by linear + * interpolation. + * + * This can also be interpreted as a floating-point number format, + * where the low frac_bits are the mantissa (with implicit leading + * 1 bit), and the more significant bits are the exponent. + * The return value is 1.mantissa * 2^exponent. + * + * The way this is used, fract_bits is 6 and the largest possible + * input is CONGESTED_MAX-1 = 1023 (exponent 16, mantissa 0x1.fc), + * so the maximum output is 0x1fc00. 
+ */ static inline unsigned int fract_exp_two(unsigned int x, unsigned int fract_bits) { - unsigned int fract = x & ~(~0 << fract_bits); - - x >>= fract_bits; - x = 1 << x; - x += (x * fract) >> fract_bits; + unsigned int mantissa = 1 << fract_bits; /* Implicit bit */ - return x; + mantissa += x & (mantissa - 1); + x >>= fract_bits; /* The exponent */ + /* Largest intermediate value 0x7f0000 */ + return mantissa << x >> fract_bits; } void bch_bio_map(struct bio *bio, void *base); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9faed1c92b52..7f6462f74ac8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1442,11 +1442,10 @@ out: static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) { - unsigned int i; struct bio_vec *bv; struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, clone, i, iter_all) { + bio_for_each_segment_all(bv, clone, iter_all) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, &cc->page_pool); } diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 12b5216c2cfe..721efc493942 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -135,9 +135,8 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); /* * Funtions to manipulate consecutive chunks */ -# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) -# define DM_CHUNK_CONSECUTIVE_BITS 8 -# define DM_CHUNK_NUMBER_BITS 56 +#define DM_CHUNK_CONSECUTIVE_BITS 8 +#define DM_CHUNK_NUMBER_BITS 56 static inline chunk_t dm_chunk_number(chunk_t chunk) { @@ -163,29 +162,6 @@ static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); } -# else -# define DM_CHUNK_CONSECUTIVE_BITS 0 - -static inline chunk_t dm_chunk_number(chunk_t chunk) -{ - return chunk; -} - -static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) -{ - return 0; -} - -static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) -{ -} - -static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) -{ -} - -# endif - /* * Return the number of sectors in the device. 
*/ diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 95ae4bf34203..c27c32cf4a30 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -88,14 +88,10 @@ struct journal_entry { #if BITS_PER_LONG == 64 #define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0) -#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) -#elif defined(CONFIG_LBDAF) -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) -#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #else -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0) -#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) #endif +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) #define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) #define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2)) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 1cd4f991792c..3a62a46b75c7 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -490,10 +490,10 @@ void md_bitmap_print_sb(struct bitmap *bitmap) pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); pr_debug(" version: %d\n", le32_to_cpu(sb->version)); pr_debug(" uuid: %08x.%08x.%08x.%08x\n", - le32_to_cpu(*(__u32 *)(sb->uuid+0)), - le32_to_cpu(*(__u32 *)(sb->uuid+4)), - le32_to_cpu(*(__u32 *)(sb->uuid+8)), - le32_to_cpu(*(__u32 *)(sb->uuid+12))); + le32_to_cpu(*(__le32 *)(sb->uuid+0)), + le32_to_cpu(*(__le32 *)(sb->uuid+4)), + le32_to_cpu(*(__le32 *)(sb->uuid+8)), + le32_to_cpu(*(__le32 *)(sb->uuid+12))); pr_debug(" events: %llu\n", (unsigned long long) le64_to_cpu(sb->events)); pr_debug("events cleared: %llu\n", diff --git a/drivers/md/md.c b/drivers/md/md.c index 05ffffb8b769..45ffa23fa85d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -88,8 +88,7 @@ static struct kobj_type md_ktype; struct md_cluster_operations *md_cluster_ops; EXPORT_SYMBOL(md_cluster_ops); -struct module *md_cluster_mod; -EXPORT_SYMBOL(md_cluster_mod); +static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; @@ -132,24 +131,6 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } -static void * flush_info_alloc(gfp_t gfp_flags, void *data) -{ - return kzalloc(sizeof(struct flush_info), gfp_flags); -} -static void flush_info_free(void *flush_info, void *data) -{ - kfree(flush_info); -} - -static void * flush_bio_alloc(gfp_t gfp_flags, void *data) -{ - return kzalloc(sizeof(struct flush_bio), gfp_flags); -} -static void flush_bio_free(void *flush_bio, void *data) -{ - kfree(flush_bio); -} - static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -423,54 +404,31 @@ static int md_congested(void *data, int bits) /* * Generic flush handling for md */ -static void submit_flushes(struct work_struct *ws) -{ - struct flush_info *fi = container_of(ws, struct flush_info, flush_work); - struct mddev *mddev = fi->mddev; - 
struct bio *bio = fi->bio; - - bio->bi_opf &= ~REQ_PREFLUSH; - md_handle_request(mddev, bio); - - mempool_free(fi, mddev->flush_pool); -} -static void md_end_flush(struct bio *fbio) +static void md_end_flush(struct bio *bio) { - struct flush_bio *fb = fbio->bi_private; - struct md_rdev *rdev = fb->rdev; - struct flush_info *fi = fb->fi; - struct bio *bio = fi->bio; - struct mddev *mddev = fi->mddev; + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; rdev_dec_pending(rdev, mddev); - if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) { - /* an empty barrier - all done */ - bio_endio(bio); - mempool_free(fi, mddev->flush_pool); - } else { - INIT_WORK(&fi->flush_work, submit_flushes); - queue_work(md_wq, &fi->flush_work); - } + if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pre-request flush has finished */ + queue_work(md_wq, &mddev->flush_work); } - - mempool_free(fb, mddev->flush_bio_pool); - bio_put(fbio); + bio_put(bio); } -void md_flush_request(struct mddev *mddev, struct bio *bio) +static void md_submit_flush_data(struct work_struct *ws); + +static void submit_flushes(struct work_struct *ws) { + struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct md_rdev *rdev; - struct flush_info *fi; - - fi = mempool_alloc(mddev->flush_pool, GFP_NOIO); - - fi->bio = bio; - fi->mddev = mddev; - atomic_set(&fi->flush_pending, 1); + mddev->start_flush = ktime_get_boottime(); + INIT_WORK(&mddev->flush_work, md_submit_flush_data); + atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && @@ -480,37 +438,74 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) * we reclaim rcu_read_lock */ struct bio *bi; - struct flush_bio *fb; atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - - fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO); - fb->fi = fi; - fb->rdev = rdev; - bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); - bio_set_dev(bi, rdev->bdev); bi->bi_end_io = md_end_flush; - bi->bi_private = fb; + bi->bi_private = rdev; + bio_set_dev(bi, rdev->bdev); bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - atomic_inc(&fi->flush_pending); + atomic_inc(&mddev->flush_pending); submit_bio(bi); - rcu_read_lock(); rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); + if (atomic_dec_and_test(&mddev->flush_pending)) + queue_work(md_wq, &mddev->flush_work); +} + +static void md_submit_flush_data(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct bio *bio = mddev->flush_bio; + + /* + * must reset flush_bio before calling into md_handle_request to avoid a + * deadlock, because other bios passed md_handle_request suspend check + * could wait for this and below md_handle_request could wait for those + * bios because of suspend check + */ + mddev->last_flush = mddev->start_flush; + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); + + if (bio->bi_iter.bi_size == 0) { + /* an empty barrier - all done */ + bio_endio(bio); + } else { + bio->bi_opf &= ~REQ_PREFLUSH; + md_handle_request(mddev, bio); + } +} - if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) { +void md_flush_request(struct mddev *mddev, struct bio *bio) +{ + ktime_t start = ktime_get_boottime(); + spin_lock_irq(&mddev->lock); + wait_event_lock_irq(mddev->sb_wait, + !mddev->flush_bio || + ktime_after(mddev->last_flush, start), + mddev->lock); + if (!ktime_after(mddev->last_flush, start)) { + 
WARN_ON(mddev->flush_bio); + mddev->flush_bio = bio; + bio = NULL; + } + spin_unlock_irq(&mddev->lock); + + if (!bio) { + INIT_WORK(&mddev->flush_work, submit_flushes); + queue_work(md_wq, &mddev->flush_work); + } else { + /* flush was performed for some other bio while we waited. */ + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio); - mempool_free(fi, mddev->flush_pool); - } else { - INIT_WORK(&fi->flush_work, submit_flushes); - queue_work(md_wq, &fi->flush_work); + else { + bio->bi_opf &= ~REQ_PREFLUSH; + mddev->pers->make_request(mddev, bio); } } } @@ -560,6 +555,7 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); spin_lock_init(&mddev->lock); + atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; @@ -1109,8 +1105,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ - if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && - sb->level >= 1) + if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) @@ -1408,8 +1403,7 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. */ - if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && - rdev->mddev->level >= 1) + if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -1553,7 +1547,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ */ s32 offset; sector_t bb_sector; - u64 *bbp; + __le64 *bbp; int i; int sectors = le16_to_cpu(sb->bblog_size); if (sectors > (PAGE_SIZE / 512)) @@ -1565,7 +1559,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ if (!sync_page_io(rdev, bb_sector, sectors << 9, rdev->bb_page, REQ_OP_READ, 0, true)) return -EIO; - bbp = (u64 *)page_address(rdev->bb_page); + bbp = (__le64 *)page_address(rdev->bb_page); rdev->badblocks.shift = sb->bblog_shift; for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { u64 bb = le64_to_cpu(*bbp); @@ -1877,7 +1871,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) md_error(mddev, rdev); else { struct badblocks *bb = &rdev->badblocks; - u64 *bbp = (u64 *)page_address(rdev->bb_page); + __le64 *bbp = (__le64 *)page_address(rdev->bb_page); u64 *p = bb->page; sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); if (bb->changed) { @@ -2855,8 +2849,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = 0; } } else if (cmd_match(buf, "re-add")) { - if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && - rdev->saved_raid_disk >= 0) { + if (!rdev->mddev->pers) + err = -EINVAL; + else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && + rdev->saved_raid_disk >= 0) { /* clear_bit is performed _after_ all the devices * have their local Faulty bit cleared. If any writes * happen in the meantime in the local node, they @@ -3384,10 +3380,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - rv = mddev ? 
mddev_lock(mddev): -EBUSY; + rv = mddev ? mddev_lock(mddev) : -ENODEV; if (!rv) { if (rdev->mddev == NULL) - rv = -EBUSY; + rv = -ENODEV; else rv = entry->store(rdev, page, length); mddev_unlock(mddev); @@ -5511,22 +5507,6 @@ int md_run(struct mddev *mddev) if (err) return err; } - if (mddev->flush_pool == NULL) { - mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc, - flush_info_free, mddev); - if (!mddev->flush_pool) { - err = -ENOMEM; - goto abort; - } - } - if (mddev->flush_bio_pool == NULL) { - mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc, - flush_bio_free, mddev); - if (!mddev->flush_bio_pool) { - err = -ENOMEM; - goto abort; - } - } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -5686,11 +5666,8 @@ int md_run(struct mddev *mddev) return 0; abort: - mempool_destroy(mddev->flush_bio_pool); - mddev->flush_bio_pool = NULL; - mempool_destroy(mddev->flush_pool); - mddev->flush_pool = NULL; - + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -5894,14 +5871,6 @@ static void __md_stop(struct mddev *mddev) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (mddev->flush_bio_pool) { - mempool_destroy(mddev->flush_bio_pool); - mddev->flush_bio_pool = NULL; - } - if (mddev->flush_pool) { - mempool_destroy(mddev->flush_pool); - mddev->flush_pool = NULL; - } } void md_stop(struct mddev *mddev) @@ -9257,7 +9226,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) * reshape is happening in the remote node, we need to * update reshape_position and call start_reshape. */ - mddev->reshape_position = sb->reshape_position; + mddev->reshape_position = le64_to_cpu(sb->reshape_position); if (mddev->pers->update_reshape_pos) mddev->pers->update_reshape_pos(mddev); if (mddev->pers->start_reshape) diff --git a/drivers/md/md.h b/drivers/md/md.h index c52afb52c776..257cb4c9e22b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -252,19 +252,6 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; -#define NR_FLUSH_INFOS 8 -#define NR_FLUSH_BIOS 64 -struct flush_info { - struct bio *bio; - struct mddev *mddev; - struct work_struct flush_work; - atomic_t flush_pending; -}; -struct flush_bio { - struct flush_info *fi; - struct md_rdev *rdev; -}; - struct mddev { void *private; struct md_personality *pers; @@ -470,8 +457,16 @@ struct mddev { * metadata and bitmap writes */ - mempool_t *flush_pool; - mempool_t *flush_bio_pool; + /* Generic flush handling. + * The last to finish preflush schedules a worker to submit + * the rest of the request (without the REQ_PREFLUSH flag). + */ + struct bio *flush_bio; + atomic_t flush_pending; + ktime_t start_flush, last_flush; /* last_flush is when the last completed + * flush was started. 
+ */ + struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fdf451aac369..0c8a098d220e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2110,7 +2110,7 @@ static void process_checks(struct r1bio *r1_bio) } r1_bio->read_disk = primary; for (i = 0; i < conf->raid_disks * 2; i++) { - int j; + int j = 0; struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; blk_status_t status = sbio->bi_status; @@ -2125,8 +2125,8 @@ static void process_checks(struct r1bio *r1_bio) /* Now we can 'fixup' the error value */ sbio->bi_status = 0; - bio_for_each_segment_all(bi, sbio, j, iter_all) - page_len[j] = bi->bv_len; + bio_for_each_segment_all(bi, sbio, iter_all) + page_len[j++] = bi->bv_len; if (!status) { for (j = vcnt; j-- ; ) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c033bfcb209e..7fde645d2e90 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -711,6 +711,8 @@ static bool is_full_stripe_write(struct stripe_head *sh) } static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) + __acquires(&sh1->stripe_lock) + __acquires(&sh2->stripe_lock) { if (sh1 > sh2) { spin_lock_irq(&sh2->stripe_lock); @@ -722,6 +724,8 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) } static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) + __releases(&sh1->stripe_lock) + __releases(&sh2->stripe_lock) { spin_unlock(&sh1->stripe_lock); spin_unlock_irq(&sh2->stripe_lock); @@ -4187,7 +4191,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, /* now write out any block on a failed drive, * or P or Q if they were recomputed */ - BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ + dev = NULL; if (s->failed == 2) { dev = &sh->dev[s->failed_num[1]]; s->locked++; @@ -4212,6 +4216,14 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } + if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags), + "%s: disk%td not up to date\n", + mdname(conf->mddev), + dev - (struct r5dev *) &sh->dev)) { + clear_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_Wantwrite, &dev->flags); + s->locked--; + } clear_bit(STRIPE_DEGRADED, &sh->state); set_bit(STRIPE_INSYNC, &sh->state); @@ -6166,6 +6178,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, static int handle_active_stripes(struct r5conf *conf, int group, struct r5worker *worker, struct list_head *temp_inactive_list) + __releases(&conf->device_lock) + __acquires(&conf->device_lock) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; int i, batch_size = 0, hash; |