19 files changed, 159 insertions, 78 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b59727309072..bfec3bdfe598 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -259,7 +259,7 @@ config DM_CRYPT
 	  the ciphers you're going to use in the cryptoapi configuration.
 
 	  For further information on dm-crypt and userspace tools see:
-	  <http://code.google.com/p/cryptsetup/wiki/DMCrypt>
+	  <https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt>
 
 	  To compile this code as a module, choose M here: the module will
 	  be called dm-crypt.
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ed2346ddf4c9..e51de52eeb94 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -494,7 +494,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
 
-	bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
+	bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (bitmap->storage.sb_page == NULL)
 		return -ENOMEM;
 	bitmap->storage.sb_page->index = 0;
@@ -541,6 +541,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 	sb->state = cpu_to_le32(bitmap->flags);
 	bitmap->events_cleared = bitmap->mddev->events;
 	sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+	bitmap->mddev->bitmap_info.nodes = 0;
 
 	kunmap_atomic(sb);
 
@@ -558,6 +559,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
+	loff_t offset = bitmap->mddev->bitmap_info.offset;
 
 	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
 		chunksize = 128 * 1024 * 1024;
@@ -584,9 +586,9 @@ re_read:
 		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
 		/* to 4k blocks */
 		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
-		bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+		offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
 		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
-			bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+			bitmap->cluster_slot, offset);
 	}
 
 	if (bitmap->storage.file) {
@@ -597,7 +599,7 @@ re_read:
 				bitmap, bytes, sb_page);
 	} else {
 		err = read_sb_page(bitmap->mddev,
-				   bitmap->mddev->bitmap_info.offset,
+				   offset,
 				   sb_page,
 				   0, sizeof(bitmap_super_t));
 	}
@@ -611,8 +613,16 @@ re_read:
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
 	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
-	nodes = le32_to_cpu(sb->nodes);
-	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
+	/* XXX: This is a hack to ensure that we don't use clustering
+	 *  in case:
+	 *	- dm-raid is in use and
+	 *	- the nodes written in bitmap_sb is erroneous.
+	 */
+	if (!bitmap->mddev->sync_super) {
+		nodes = le32_to_cpu(sb->nodes);
+		strlcpy(bitmap->mddev->bitmap_info.cluster_name,
+				sb->cluster_name, 64);
+	}
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -671,7 +681,7 @@ out:
 	kunmap_atomic(sb);
 	/* Assiging chunksize is required for "re_read" */
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
-	if (nodes && (bitmap->cluster_slot < 0)) {
+	if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
 		err = md_setup_cluster(bitmap->mddev, nodes);
 		if (err) {
 			pr_err("%s: Could not setup cluster service (%d)\n",
@@ -1866,10 +1876,6 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 	if (IS_ERR(bitmap))
 		return PTR_ERR(bitmap);
 
-	rv = bitmap_read_sb(bitmap);
-	if (rv)
-		goto err;
-
 	rv = bitmap_init_from_disk(bitmap, 0);
 	if (rv)
 		goto err;
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 32814371b8d3..aa1b41ca40f7 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1471,5 +1471,3 @@ module_exit(mq_exit);
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("mq cache policy");
-
-MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index b6f22651dd35..200366c62231 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1686,7 +1686,7 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
 
 	if (from_cblock(cache_size)) {
 		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
-		if (!mq->cache_hit_bits && mq->cache_hit_bits) {
+		if (!mq->cache_hit_bits) {
 			DMERR("couldn't allocate cache hit bitset");
 			goto bad_cache_hit_bits;
 		}
@@ -1789,3 +1789,5 @@ module_exit(smq_exit);
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("smq cache policy");
+
+MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index b680da5d7b93..1fe93cfea7d3 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -424,6 +424,7 @@ static void free_migration(struct dm_cache_migration *mg)
 		wake_up(&cache->migration_wait);
 
 	mempool_free(mg, cache->migration_pool);
+	wake_worker(cache);
 }
 
 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
@@ -1966,6 +1967,7 @@ static void process_deferred_bios(struct cache *cache)
 		 * this bio might require one, we pause until there are some
 		 * prepared mappings to process.
 		 */
+		prealloc_used = true;
 		if (prealloc_data_structs(cache, &structs)) {
 			spin_lock_irqsave(&cache->lock, flags);
 			bio_list_merge(&cache->deferred_bios, &bios);
@@ -1981,7 +1983,6 @@ static void process_deferred_bios(struct cache *cache)
 			process_discard_bio(cache, &structs, bio);
 		else
 			process_bio(cache, &structs, bio);
-		prealloc_used = true;
 	}
 
 	if (prealloc_used)
@@ -2010,6 +2011,7 @@ static void process_deferred_cells(struct cache *cache)
 		 * this bio might require one, we pause until there are some
 		 * prepared mappings to process.
 		 */
+		prealloc_used = true;
 		if (prealloc_data_structs(cache, &structs)) {
 			spin_lock_irqsave(&cache->lock, flags);
 			list_splice(&cells, &cache->deferred_cells);
@@ -2018,7 +2020,6 @@ static void process_deferred_cells(struct cache *cache)
 		}
 
 		process_cell(cache, &structs, cell);
-		prealloc_used = true;
 	}
 
 	if (prealloc_used)
@@ -2080,6 +2081,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 		if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
 			break; /* no work to do */
 
+		prealloc_used = true;
 		if (prealloc_data_structs(cache, &structs) ||
 		    get_cell(cache, oblock, &structs, &old_ocell)) {
 			policy_set_dirty(cache->policy, oblock);
@@ -2087,7 +2089,6 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 		}
 
 		writeback(cache, &structs, oblock, cblock, old_ocell);
-		prealloc_used = true;
 	}
 
 	if (prealloc_used)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 48dfe3c4d6aa..6ba47cfb1443 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1293,8 +1293,8 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd)
 		return r;
 
 	disk_super = dm_block_data(copy);
-	dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
-	dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
+	dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
+	dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
 	dm_sm_dec_block(pmd->metadata_sm, held_root);
 
 	return dm_tm_unlock(pmd->tm, copy);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 1c50c580215c..d2bbe8cc1e97 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -666,16 +666,21 @@ static void requeue_io(struct thin_c *tc)
 	requeue_deferred_cells(tc);
 }
 
-static void error_retry_list(struct pool *pool)
+static void error_retry_list_with_code(struct pool *pool, int error)
 {
 	struct thin_c *tc;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(tc, &pool->active_thins, list)
-		error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
+		error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
 	rcu_read_unlock();
 }
 
+static void error_retry_list(struct pool *pool)
+{
+	return error_retry_list_with_code(pool, -EIO);
+}
+
 /*
  * This section of code contains the logic for processing a thin device's IO.
  * Much of the code depends on pool object resources (lists, workqueues, etc)
@@ -2297,7 +2302,7 @@ static void do_no_space_timeout(struct work_struct *ws)
 	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
 		pool->pf.error_if_no_space = true;
 		notify_of_pool_mode_change_to_oods(pool);
-		error_retry_list(pool);
+		error_retry_list_with_code(pool, -ENOSPC);
 	}
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ab37ae114e94..0d7ab20c58df 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1729,7 +1729,8 @@ static int dm_merge_bvec(struct request_queue *q,
 	struct mapped_device *md = q->queuedata;
 	struct dm_table *map = dm_get_live_table_fast(md);
 	struct dm_target *ti;
-	sector_t max_sectors, max_size = 0;
+	sector_t max_sectors;
+	int max_size = 0;
 
 	if (unlikely(!map))
 		goto out;
@@ -1742,18 +1743,10 @@ static int dm_merge_bvec(struct request_queue *q,
 	 * Find maximum amount of I/O that won't need splitting
 	 */
 	max_sectors = min(max_io_len(bvm->bi_sector, ti),
-			  (sector_t) queue_max_sectors(q));
+			  (sector_t) BIO_MAX_SECTORS);
 	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
-
-	/*
-	 * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t
-	 * to the targets' merge function since it holds sectors not bytes).
-	 * Just doing this as an interim fix for stable@ because the more
-	 * comprehensive cleanup of switching to sector_t will impact every
-	 * DM target that implements a ->merge hook.
-	 */
-	if (max_size > INT_MAX)
-		max_size = INT_MAX;
+	if (max_size < 0)
+		max_size = 0;
 
 	/*
 	 * merge_bvec_fn() returns number of bytes
@@ -1761,13 +1754,13 @@ static int dm_merge_bvec(struct request_queue *q,
 	 * max is precomputed maximal io size
 	 */
 	if (max_size && ti->type->merge)
-		max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);
+		max_size = ti->type->merge(ti, bvm, biovec, max_size);
 	/*
 	 * If the target doesn't support merge method and some of the devices
-	 * provided their merge_bvec method (we know this by looking for the
-	 * max_hw_sectors that dm_set_device_limits may set), then we can't
-	 * allow bios with multiple vector entries.  So always set max_size
-	 * to 0, and the code below allows just one page.
+	 * provided their merge_bvec method (we know this by looking at
+	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
+	 * entries.  So always set max_size to 0, and the code below allows
+	 * just one page.
 	 */
 	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
 		max_size = 0;
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index fcfc4b9b2672..0072190515e0 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -44,6 +44,7 @@ struct resync_info {
 
 /* md_cluster_info flags */
 #define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
+#define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
 
 
 struct md_cluster_info {
@@ -275,6 +276,9 @@ clear_bit:
 
 static void recover_prep(void *arg)
 {
+	struct mddev *mddev = arg;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 }
 
 static void recover_slot(void *arg, struct dlm_slot *slot)
@@ -307,6 +311,7 @@ static void recover_done(void *arg, struct dlm_slot *slots,
 
 	cinfo->slot_number = our_slot;
 	complete(&cinfo->completion);
+	clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 }
 
 static const struct dlm_lockspace_ops md_ls_ops = {
@@ -816,12 +821,17 @@ static void resync_finish(struct mddev *mddev)
 	resync_send(mddev, RESYNCING, 0, 0);
 }
 
-static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
+static int area_resyncing(struct mddev *mddev, int direction,
+		sector_t lo, sector_t hi)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	int ret = 0;
 	struct suspend_info *s;
 
+	if ((direction == READ) &&
+		test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
+		return 1;
+
 	spin_lock_irq(&cinfo->suspend_lock);
 	if (list_empty(&cinfo->suspend_list))
 		goto out;
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 6817ee00e053..00defe2badbc 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -18,7 +18,7 @@ struct md_cluster_operations {
 	int (*metadata_update_start)(struct mddev *mddev);
 	int (*metadata_update_finish)(struct mddev *mddev);
 	int (*metadata_update_cancel)(struct mddev *mddev);
-	int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
+	int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
 	int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
 	int (*add_new_disk_finish)(struct mddev *mddev);
 	int (*new_disk_ack)(struct mddev *mddev, bool ack);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d429c30cd514..e25f00f0138a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5382,6 +5382,8 @@ static void __md_stop(struct mddev *mddev)
 {
 	struct md_personality *pers = mddev->pers;
 	mddev_detach(mddev);
+	/* Ensure ->event_work is done */
+	flush_workqueue(md_misc_wq);
 	spin_lock(&mddev->lock);
 	mddev->ready = 0;
 	mddev->pers = NULL;
@@ -5757,7 +5759,7 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
 	char *ptr;
 	int err;
 
-	file = kmalloc(sizeof(*file), GFP_NOIO);
+	file = kzalloc(sizeof(*file), GFP_NOIO);
 	if (!file)
 		return -ENOMEM;
 
@@ -7437,7 +7439,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
 	err = request_module("md-cluster");
 	if (err) {
 		pr_err("md-cluster module not found.\n");
-		return err;
+		return -ENOENT;
 	}
 
 	spin_lock(&pers_lock);
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index bf2b80d5c470..8731b6ea026b 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -138,4 +138,10 @@ int lower_bound(struct btree_node *n, uint64_t key);
 
 extern struct dm_block_validator btree_node_validator;
 
+/*
+ * Value type for upper levels of multi-level btrees.
+ */
+extern void init_le64_type(struct dm_transaction_manager *tm,
+			   struct dm_btree_value_type *vt);
+
 #endif	/* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 9836c0ae897c..4222f774cf36 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -544,14 +544,6 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 	return r;
 }
 
-static struct dm_btree_value_type le64_type = {
-	.context = NULL,
-	.size = sizeof(__le64),
-	.inc = NULL,
-	.dec = NULL,
-	.equal = NULL
-};
-
 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 		    uint64_t *keys, dm_block_t *new_root)
 {
@@ -559,12 +551,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 	int index = 0, r = 0;
 	struct shadow_spine spine;
 	struct btree_node *n;
+	struct dm_btree_value_type le64_vt;
 
+	init_le64_type(info->tm, &le64_vt);
 	init_shadow_spine(&spine, info);
 	for (level = 0; level < info->levels; level++) {
 		r = remove_raw(&spine, info,
 			       (level == last_level ?
-				&info->value_type : &le64_type),
+				&info->value_type : &le64_vt),
 			       root, keys[level], (unsigned *)&index);
 		if (r < 0)
 			break;
@@ -654,11 +648,13 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root,
 	int index = 0, r = 0;
 	struct shadow_spine spine;
 	struct btree_node *n;
+	struct dm_btree_value_type le64_vt;
 	uint64_t k;
 
+	init_le64_type(info->tm, &le64_vt);
 	init_shadow_spine(&spine, info);
 	for (level = 0; level < last_level; level++) {
-		r = remove_raw(&spine, info, &le64_type,
+		r = remove_raw(&spine, info, &le64_vt,
 			       root, keys[level], (unsigned *) &index);
 		if (r < 0)
 			goto out;
@@ -689,6 +685,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root,
 					     value_ptr(n, index));
 
 		delete_at(n, index);
+		keys[last_level] = k + 1ull;
 
 	} else
 		r = -ENODATA;
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index 1b5e13ec7f96..0dee514ba4c5 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -249,3 +249,40 @@ int shadow_root(struct shadow_spine *s)
 {
 	return s->root;
 }
+
+static void le64_inc(void *context, const void *value_le)
+{
+	struct dm_transaction_manager *tm = context;
+	__le64 v_le;
+
+	memcpy(&v_le, value_le, sizeof(v_le));
+	dm_tm_inc(tm, le64_to_cpu(v_le));
+}
+
+static void le64_dec(void *context, const void *value_le)
+{
+	struct dm_transaction_manager *tm = context;
+	__le64 v_le;
+
+	memcpy(&v_le, value_le, sizeof(v_le));
+	dm_tm_dec(tm, le64_to_cpu(v_le));
+}
+
+static int le64_equal(void *context, const void *value1_le, const void *value2_le)
+{
+	__le64 v1_le, v2_le;
+
+	memcpy(&v1_le, value1_le, sizeof(v1_le));
+	memcpy(&v2_le, value2_le, sizeof(v2_le));
+	return v1_le == v2_le;
+}
+
+void init_le64_type(struct dm_transaction_manager *tm,
+		    struct dm_btree_value_type *vt)
+{
+	vt->context = tm;
+	vt->size = sizeof(__le64);
+	vt->inc = le64_inc;
+	vt->dec = le64_dec;
+	vt->equal = le64_equal;
+}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index fdd3793e22f9..c7726cebc495 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -667,12 +667,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
 	struct btree_node *n;
 	struct dm_btree_value_type le64_type;
 
-	le64_type.context = NULL;
-	le64_type.size = sizeof(__le64);
-	le64_type.inc = NULL;
-	le64_type.dec = NULL;
-	le64_type.equal = NULL;
-
+	init_le64_type(info->tm, &le64_type);
 	init_shadow_spine(&spine, info);
 
 	for (level = 0; level < (info->levels - 1); level++) {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f80f1af61ce7..967a4ed73929 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -336,7 +336,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (r1_bio->mddev->degraded == conf->raid_disks ||
 		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
-		     !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+		     test_bit(In_sync, &conf->mirrors[mirror].rdev->flags)))
 			uptodate = 1;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
@@ -541,7 +541,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 
 	if ((conf->mddev->recovery_cp < this_sector + sectors) ||
 	    (mddev_is_clustered(conf->mddev) &&
-	    md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+	    md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
 		    this_sector + sectors)))
 		choose_first = 1;
 	else
@@ -1111,7 +1111,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	    ((bio_end_sector(bio) > mddev->suspend_lo &&
 	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
 	    (mddev_is_clustered(mddev) &&
-	     md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
+	     md_cluster_ops->area_resyncing(mddev, WRITE,
+		     bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
 		/* As the suspend_* range is controlled by
 		 * userspace, we want an interruptible
 		 * wait.
@@ -1124,7 +1125,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			if (bio_end_sector(bio) <= mddev->suspend_lo ||
 			    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
 			    (mddev_is_clustered(mddev) &&
-			     !md_cluster_ops->area_resyncing(mddev,
+			     !md_cluster_ops->area_resyncing(mddev, WRITE,
 				     bio->bi_iter.bi_sector, bio_end_sector(bio))))
 				break;
 			schedule();
@@ -1475,6 +1476,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r1conf *conf = mddev->private;
+	unsigned long flags;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1494,14 +1496,13 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 		return;
 	}
 	set_bit(Blocked, &rdev->flags);
+	spin_lock_irqsave(&conf->device_lock, flags);
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		mddev->degraded++;
 		set_bit(Faulty, &rdev->flags);
-		spin_unlock_irqrestore(&conf->device_lock, flags);
 	} else
 		set_bit(Faulty, &rdev->flags);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 	/*
 	 * if recovery is running, make sure it aborts.
 	 */
@@ -1567,7 +1568,10 @@ static int raid1_spare_active(struct mddev *mddev)
 	 * Find all failed disks within the RAID1 configuration
 	 * and mark them readable.
 	 * Called under mddev lock, so rcu protection not needed.
+	 * device_lock used to avoid races with raid1_end_read_request
+	 * which expects 'In_sync' flags and ->degraded to be consistent.
 	 */
+	spin_lock_irqsave(&conf->device_lock, flags);
 	for (i = 0; i < conf->raid_disks; i++) {
 		struct md_rdev *rdev = conf->mirrors[i].rdev;
 		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
@@ -1598,7 +1602,6 @@ static int raid1_spare_active(struct mddev *mddev)
 			sysfs_notify_dirent_safe(rdev->sysfs_state);
 		}
 	}
-	spin_lock_irqsave(&conf->device_lock, flags);
 	mddev->degraded -= count;
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 940f2f365461..38c58e19cfce 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3556,6 +3556,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 			/* far_copies must be 1 */
 			conf->prev.stride = conf->dev_sectors;
 	}
+	conf->reshape_safe = conf->reshape_progress;
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
 
@@ -3760,7 +3761,6 @@ static int run(struct mddev *mddev)
 		}
 		conf->offset_diff = min_offset_diff;
 
-		conf->reshape_safe = conf->reshape_progress;
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -4103,6 +4103,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 		conf->reshape_progress = size;
 	} else
 		conf->reshape_progress = 0;
+	conf->reshape_safe = conf->reshape_progress;
 	spin_unlock_irq(&conf->device_lock);
 
 	if (mddev->delta_disks && mddev->bitmap) {
@@ -4170,6 +4171,7 @@ abort:
 		rdev->new_data_offset = rdev->data_offset;
 	smp_wmb();
 	conf->reshape_progress = MaxSector;
+	conf->reshape_safe = MaxSector;
 	mddev->reshape_position = MaxSector;
 	spin_unlock_irq(&conf->device_lock);
 	return ret;
@@ -4524,6 +4526,7 @@ static void end_reshape(struct r10conf *conf)
 	md_finish_reshape(conf->mddev);
 	smp_wmb();
 	conf->reshape_progress = MaxSector;
+	conf->reshape_safe = MaxSector;
 	spin_unlock_irq(&conf->device_lock);
 
 	/* read-ahead size must cover two whole stripes, which is
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59e44e99eef3..f757023fc458 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2162,6 +2162,9 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	if (!sc)
 		return -ENOMEM;
 
+	/* Need to ensure auto-resizing doesn't interfere */
+	mutex_lock(&conf->cache_size_mutex);
+
 	for (i = conf->max_nr_stripes; i; i--) {
 		nsh = alloc_stripe(sc, GFP_KERNEL);
 		if (!nsh)
@@ -2178,6 +2181,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			kmem_cache_free(sc, nsh);
 		}
 		kmem_cache_destroy(sc);
+		mutex_unlock(&conf->cache_size_mutex);
 		return -ENOMEM;
 	}
 	/* Step 2 - Must use GFP_NOIO now.
@@ -2224,6 +2228,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	} else
 		err = -ENOMEM;
 
+	mutex_unlock(&conf->cache_size_mutex);
 	/* Step 4, return new stripes to service */
 	while(!list_empty(&newstripes)) {
 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2251,7 +2256,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 static int drop_one_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh;
-	int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
+	int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
 
 	spin_lock_irq(conf->hash_locks + hash);
 	sh = get_free_stripe(conf, hash);
@@ -4061,8 +4066,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 				 &first_bad, &bad_sectors))
 			set_bit(R5_ReadRepl, &dev->flags);
 		else {
-			if (rdev)
+			if (rdev && !test_bit(Faulty, &rdev->flags))
 				set_bit(R5_NeedReplace, &dev->flags);
+			else
+				clear_bit(R5_NeedReplace, &dev->flags);
 			rdev = rcu_dereference(conf->disks[i].rdev);
 			clear_bit(R5_ReadRepl, &dev->flags);
 		}
@@ -5857,12 +5864,14 @@ static void raid5d(struct md_thread *thread)
 	pr_debug("%d stripes handled\n", handled);
 
 	spin_unlock_irq(&conf->device_lock);
-	if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+	if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
+	    mutex_trylock(&conf->cache_size_mutex)) {
 		grow_one_stripe(conf, __GFP_NOWARN);
 		/* Set flag even if allocation failed.  This helps
 		 * slow down allocation requests when mem is short
 		 */
 		set_bit(R5_DID_ALLOC, &conf->cache_state);
+		mutex_unlock(&conf->cache_size_mutex);
 	}
 
 	async_tx_issue_pending_all();
@@ -5894,18 +5903,22 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 		return -EINVAL;
 
 	conf->min_nr_stripes = size;
+	mutex_lock(&conf->cache_size_mutex);
 	while (size < conf->max_nr_stripes &&
 	       drop_one_stripe(conf))
 		;
+	mutex_unlock(&conf->cache_size_mutex);
 
 
 	err = md_allow_write(mddev);
 	if (err)
 		return err;
 
+	mutex_lock(&conf->cache_size_mutex);
 	while (size > conf->max_nr_stripes)
 		if (!grow_one_stripe(conf, GFP_KERNEL))
 			break;
+	mutex_unlock(&conf->cache_size_mutex);
 
 	return 0;
 }
@@ -6371,11 +6384,19 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink,
 				      struct shrink_control *sc)
 {
 	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
-	int ret = 0;
-	while (ret < sc->nr_to_scan) {
-		if (drop_one_stripe(conf) == 0)
-			return SHRINK_STOP;
-		ret++;
+	unsigned long ret = SHRINK_STOP;
+
+	if (mutex_trylock(&conf->cache_size_mutex)) {
+		ret= 0;
+		while (ret < sc->nr_to_scan &&
+		       conf->max_nr_stripes > conf->min_nr_stripes) {
+			if (drop_one_stripe(conf) == 0) {
+				ret = SHRINK_STOP;
+				break;
+			}
+			ret++;
+		}
+		mutex_unlock(&conf->cache_size_mutex);
 	}
 	return ret;
 }
@@ -6444,6 +6465,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 		goto abort;
 	spin_lock_init(&conf->device_lock);
 	seqcount_init(&conf->gen_lock);
+	mutex_init(&conf->cache_size_mutex);
 	init_waitqueue_head(&conf->wait_for_quiescent);
 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
 		init_waitqueue_head(&conf->wait_for_stripe[i]);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 02c3bf8fbfe7..d05144278690 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -482,7 +482,8 @@ struct r5conf {
 	 */
 	int			active_name;
 	char			cache_name[2][32];
-	struct kmem_cache		*slab_cache; /* for allocating stripes */
+	struct kmem_cache	*slab_cache; /* for allocating stripes */
+	struct mutex		cache_size_mutex; /* Protect changes to cache size */
 
 	int			seq_flush, seq_write;
 	int			quiesce;