summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/bcache.h4
-rw-r--r--drivers/md/bcache/btree.c40
-rw-r--r--drivers/md/bcache/btree.h3
-rw-r--r--drivers/md/bcache/request.c4
-rw-r--r--drivers/md/bcache/super.c2
-rw-r--r--drivers/md/dm-cache-target.c6
-rw-r--r--drivers/md/dm-crypt.c7
-rw-r--r--drivers/md/dm-flakey.c2
-rw-r--r--drivers/md/dm-stats.c1
-rw-r--r--drivers/md/dm.c55
-rw-r--r--drivers/md/linear.c39
-rw-r--r--drivers/md/linear.h1
-rw-r--r--drivers/md/md.c2
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c14
-rw-r--r--drivers/md/raid10.c22
-rw-r--r--drivers/md/raid5.c9
16 files changed, 163 insertions, 48 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 6b420a55c745..c3ea03c9a1a8 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -425,7 +425,7 @@ struct cache {
* until a gc finishes - otherwise we could pointlessly burn a ton of
* cpu
*/
- unsigned invalidate_needs_gc:1;
+ unsigned invalidate_needs_gc;
bool discard; /* Get rid of? */
@@ -593,8 +593,8 @@ struct cache_set {
/* Counts how many sectors bio_insert has added to the cache */
atomic_t sectors_to_gc;
+ wait_queue_head_t gc_wait;
- wait_queue_head_t moving_gc_wait;
struct keybuf moving_gc_keys;
/* Number of moving GC bios in flight */
struct semaphore moving_in_flight;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 22b9e34ceb75..5b815e64c1c9 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1762,33 +1762,34 @@ static void bch_btree_gc(struct cache_set *c)
bch_moving_gc(c);
}
-static int bch_gc_thread(void *arg)
+static bool gc_should_run(struct cache_set *c)
{
- struct cache_set *c = arg;
struct cache *ca;
unsigned i;
- while (1) {
-again:
- bch_btree_gc(c);
+ for_each_cache(ca, c, i)
+ if (ca->invalidate_needs_gc)
+ return true;
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
- break;
+ if (atomic_read(&c->sectors_to_gc) < 0)
+ return true;
- mutex_lock(&c->bucket_lock);
+ return false;
+}
- for_each_cache(ca, c, i)
- if (ca->invalidate_needs_gc) {
- mutex_unlock(&c->bucket_lock);
- set_current_state(TASK_RUNNING);
- goto again;
- }
+static int bch_gc_thread(void *arg)
+{
+ struct cache_set *c = arg;
- mutex_unlock(&c->bucket_lock);
+ while (1) {
+ wait_event_interruptible(c->gc_wait,
+ kthread_should_stop() || gc_should_run(c));
- try_to_freeze();
- schedule();
+ if (kthread_should_stop())
+ break;
+
+ set_gc_sectors(c);
+ bch_btree_gc(c);
}
return 0;
@@ -1796,11 +1797,10 @@ again:
int bch_gc_thread_start(struct cache_set *c)
{
- c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
+ c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
if (IS_ERR(c->gc_thread))
return PTR_ERR(c->gc_thread);
- set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
return 0;
}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 5c391fa01bed..9b80417cd547 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -260,8 +260,7 @@ void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
static inline void wake_up_gc(struct cache_set *c)
{
- if (c->gc_thread)
- wake_up_process(c->gc_thread);
+ wake_up(&c->gc_wait);
}
#define MAP_DONE 0
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 25fa8445bb24..2410df1c2a05 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -196,10 +196,8 @@ static void bch_data_insert_start(struct closure *cl)
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
struct bio *bio = op->bio, *n;
- if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
- set_gc_sectors(op->c);
+ if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
wake_up_gc(op->c);
- }
if (op->bypass)
return bch_data_invalidate(cl);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3d5c0ba13181..7b5880b8874c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1489,6 +1489,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
mutex_init(&c->bucket_lock);
init_waitqueue_head(&c->btree_cache_wait);
init_waitqueue_head(&c->bucket_wait);
+ init_waitqueue_head(&c->gc_wait);
sema_init(&c->uuid_write_mutex, 1);
spin_lock_init(&c->btree_gc_time.lock);
@@ -1547,6 +1548,7 @@ static void run_cache_set(struct cache_set *c)
for_each_cache(ca, c, i)
c->nbuckets += ca->sb.nbuckets;
+ set_gc_sectors(c);
if (CACHE_SYNC(&c->sb)) {
LIST_HEAD(journal);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 515f83e7d9ab..b59615ddf6ba 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -251,7 +251,7 @@ struct cache {
/*
* Fields for converting from sectors to blocks.
*/
- uint32_t sectors_per_block;
+ sector_t sectors_per_block;
int sectors_per_block_shift;
spinlock_t lock;
@@ -3547,11 +3547,11 @@ static void cache_status(struct dm_target *ti, status_type_t type,
residency = policy_residency(cache->policy);
- DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
+ DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
(unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
- cache->sectors_per_block,
+ (unsigned long long)cache->sectors_per_block,
(unsigned long long) from_cblock(residency),
(unsigned long long) from_cblock(cache->cache_size),
(unsigned) atomic_read(&cache->stats.read_hit),
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 5cac11d7a876..de628883ee3d 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1500,12 +1500,15 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
if (!cc->key_size && strcmp(key, "-"))
goto out;
+ /* clear the flag since following operations may invalidate previously valid key */
+ clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
+
if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
goto out;
- set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
-
r = crypt_setkey_allcpus(cc);
+ if (!r)
+ set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
out:
/* Hex key string not needed after here, so wipe it. */
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 8e9e928dafba..78f403b45ab3 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -200,11 +200,13 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (!(fc->up_interval + fc->down_interval)) {
ti->error = "Total (up + down) interval is zero";
+ r = -EINVAL;
goto bad;
}
if (fc->up_interval + fc->down_interval < fc->up_interval) {
ti->error = "Interval overflow";
+ r = -EINVAL;
goto bad;
}
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 8289804ccd99..d5ea9f28ae70 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -175,6 +175,7 @@ static void dm_stat_free(struct rcu_head *head)
int cpu;
struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
+ kfree(s->histogram_boundaries);
kfree(s->program_id);
kfree(s->aux_data);
for_each_possible_cpu(cpu) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3384a3eef917..397f0454100b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1467,11 +1467,62 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
+/*
+ * Flush current->bio_list when the target map method blocks.
+ * This fixes deadlocks in snapshot and possibly in other targets.
+ */
+struct dm_offload {
+ struct blk_plug plug;
+ struct blk_plug_cb cb;
+};
+
+static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct dm_offload *o = container_of(cb, struct dm_offload, cb);
+ struct bio_list list;
+ struct bio *bio;
+
+ INIT_LIST_HEAD(&o->cb.list);
+
+ if (unlikely(!current->bio_list))
+ return;
+
+ list = *current->bio_list;
+ bio_list_init(current->bio_list);
+
+ while ((bio = bio_list_pop(&list))) {
+ struct bio_set *bs = bio->bi_pool;
+ if (unlikely(!bs) || bs == fs_bio_set) {
+ bio_list_add(current->bio_list, bio);
+ continue;
+ }
+
+ spin_lock(&bs->rescue_lock);
+ bio_list_add(&bs->rescue_list, bio);
+ queue_work(bs->rescue_workqueue, &bs->rescue_work);
+ spin_unlock(&bs->rescue_lock);
+ }
+}
+
+static void dm_offload_start(struct dm_offload *o)
+{
+ blk_start_plug(&o->plug);
+ o->cb.callback = flush_current_bio_list;
+ list_add(&o->cb.list, &current->plug->cb_list);
+}
+
+static void dm_offload_end(struct dm_offload *o)
+{
+ list_del(&o->cb.list);
+ blk_finish_plug(&o->plug);
+}
+
static void __map_bio(struct dm_target_io *tio)
{
int r;
sector_t sector;
struct mapped_device *md;
+ struct dm_offload o;
struct bio *clone = &tio->clone;
struct dm_target *ti = tio->ti;
@@ -1484,7 +1535,11 @@ static void __map_bio(struct dm_target_io *tio)
*/
atomic_inc(&tio->io->io_count);
sector = clone->bi_iter.bi_sector;
+
+ dm_offload_start(&o);
r = ti->type->map(ti, clone);
+ dm_offload_end(&o);
+
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index b7fe7e9fc777..6ba3227e29b2 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -52,18 +52,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
return conf->disks + lo;
}
+/*
+ * In linear_congested() conf->raid_disks is used as a copy of
+ * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks
+ * and conf->disks[] are created in linear_conf(), they are always
+ * consitent with each other, but mddev->raid_disks does not.
+ */
static int linear_congested(struct mddev *mddev, int bits)
{
struct linear_conf *conf;
int i, ret = 0;
- conf = mddev->private;
+ rcu_read_lock();
+ conf = rcu_dereference(mddev->private);
- for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+ for (i = 0; i < conf->raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
ret |= bdi_congested(&q->backing_dev_info, bits);
}
+ rcu_read_unlock();
return ret;
}
@@ -143,6 +151,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
conf->disks[i-1].end_sector +
conf->disks[i].rdev->sectors;
+ /*
+ * conf->raid_disks is copy of mddev->raid_disks. The reason to
+ * keep a copy of mddev->raid_disks in struct linear_conf is,
+ * mddev->raid_disks may not be consistent with pointers number of
+ * conf->disks[] when it is updated in linear_add() and used to
+ * iterate old conf->disks[] earray in linear_congested().
+ * Here conf->raid_disks is always consitent with number of
+ * pointers in conf->disks[] array, and mddev->private is updated
+ * with rcu_assign_pointer() in linear_addr(), such race can be
+ * avoided.
+ */
+ conf->raid_disks = raid_disks;
+
return conf;
out:
@@ -195,15 +216,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
if (!newconf)
return -ENOMEM;
+ /* newconf->raid_disks already keeps a copy of * the increased
+ * value of mddev->raid_disks, WARN_ONCE() is just used to make
+ * sure of this. It is possible that oldconf is still referenced
+ * in linear_congested(), therefore kfree_rcu() is used to free
+ * oldconf until no one uses it anymore.
+ */
mddev_suspend(mddev);
- oldconf = mddev->private;
+ oldconf = rcu_dereference(mddev->private);
mddev->raid_disks++;
- mddev->private = newconf;
+ WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+ "copied raid_disks doesn't match mddev->raid_disks");
+ rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev_resume(mddev);
revalidate_disk(mddev->gendisk);
- kfree(oldconf);
+ kfree_rcu(oldconf, rcu);
return 0;
}
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index b685ddd7d7f7..8d392e6098b3 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -10,6 +10,7 @@ struct linear_conf
{
struct rcu_head rcu;
sector_t array_sectors;
+ int raid_disks; /* a copy of mddev->raid_disks */
struct dev_info disks[0];
};
#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c1c7d4fb4b77..eff554a12fb4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6771,7 +6771,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
/* need to ensure recovery thread has run */
wait_event_interruptible_timeout(mddev->sb_wait,
!test_bit(MD_RECOVERY_NEEDED,
- &mddev->flags),
+ &mddev->recovery),
msecs_to_jiffies(5000));
if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
/* Need to flush page cache, and ensure no-one else opens
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 7e44005595c1..20557e2c60c6 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -775,17 +775,15 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
r = sm_ll_new_metadata(&smm->ll, tm);
+ if (!r) {
+ if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
+ nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
+ r = sm_ll_extend(&smm->ll, nr_blocks);
+ }
+ memcpy(&smm->sm, &ops, sizeof(smm->sm));
if (r)
return r;
- if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
- nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
- r = sm_ll_extend(&smm->ll, nr_blocks);
- if (r)
- return r;
-
- memcpy(&smm->sm, &ops, sizeof(smm->sm));
-
/*
* Now we need to update the newly created data structures with the
* allocated blocks that they were built from.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ebb0dd612ebd..a92979e704e3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1072,6 +1072,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
int max_sectors;
int sectors;
+ md_write_start(mddev, bio);
+
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -1455,8 +1457,6 @@ static void make_request(struct mddev *mddev, struct bio *bio)
return;
}
- md_write_start(mddev, bio);
-
do {
/*
@@ -1477,7 +1477,25 @@ static void make_request(struct mddev *mddev, struct bio *bio)
split = bio;
}
+ /*
+ * If a bio is splitted, the first part of bio will pass
+ * barrier but the bio is queued in current->bio_list (see
+ * generic_make_request). If there is a raise_barrier() called
+ * here, the second part of bio can't pass barrier. But since
+ * the first part bio isn't dispatched to underlaying disks
+ * yet, the barrier is never released, hence raise_barrier will
+ * alays wait. We have a deadlock.
+ * Note, this only happens in read path. For write path, the
+ * first part of bio is dispatched in a schedule() call
+ * (because of blk plug) or offloaded to raid10d.
+ * Quitting from the function immediately can change the bio
+ * order queued in bio_list and avoid the deadlock.
+ */
__make_request(mddev, split);
+ if (split != bio && bio_data_dir(bio) == READ) {
+ generic_make_request(bio);
+ break;
+ }
} while (split != bio);
/* In case raid10d snuck in to freeze_array */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 10ce885445f6..7af976934441 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6980,6 +6980,15 @@ static int run(struct mddev *mddev)
stripe = (stripe | (stripe-1)) + 1;
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
+
+ /*
+ * We use 16-bit counter of active stripes in bi_phys_segments
+ * (minus one for over-loaded initialization)
+ */
+ blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
+ blk_queue_max_discard_sectors(mddev->queue,
+ 0xfffe * STRIPE_SECTORS);
+
/*
* unaligned part of discard request will be ignored, so can't
* guarantee discard_zeroes_data