From 4e420c452b11edf9d510c8180ac66f529e5b6206 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 6 Oct 2014 13:48:51 +0100 Subject: dm bufio: switch from a huge hash table to an rbtree Converting over to using an rbtree eliminates a fixed 8MB allocation from vmalloc space for the hash table. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 97 ++++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 43 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 0be200b6dbf2..dcaa1d9dfbe4 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -14,6 +14,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "bufio" @@ -47,14 +48,6 @@ */ #define DM_BUFIO_INLINE_VECS 16 -/* - * Buffer hash - */ -#define DM_BUFIO_HASH_BITS 20 -#define DM_BUFIO_HASH(block) \ - ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \ - ((1 << DM_BUFIO_HASH_BITS) - 1)) - /* * Don't try to use kmem_cache_alloc for blocks larger than this. * For explanation, see alloc_buffer_data below. @@ -106,7 +99,7 @@ struct dm_bufio_client { unsigned minimum_buffers; - struct hlist_head *cache_hash; + struct rb_root buffer_tree; wait_queue_head_t free_buffer_wait; int async_write_error; @@ -135,7 +128,7 @@ enum data_mode { }; struct dm_buffer { - struct hlist_node hash_list; + struct rb_node node; struct list_head lru_list; sector_t block; void *data; @@ -253,6 +246,53 @@ static LIST_HEAD(dm_bufio_all_clients); */ static DEFINE_MUTEX(dm_bufio_clients_lock); +/*---------------------------------------------------------------- + * A red/black tree acts as an index for all the buffers. + *--------------------------------------------------------------*/ +static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) +{ + struct rb_node *n = c->buffer_tree.rb_node; + struct dm_buffer *b; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + n = (b->block < block) ? n->rb_left : n->rb_right; + } + + return NULL; +} + +static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) +{ + struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; + struct dm_buffer *found; + + while (*new) { + found = container_of(*new, struct dm_buffer, node); + + if (found->block == b->block) { + BUG_ON(found != b); + return; + } + + parent = *new; + new = (found->block < b->block) ? + &((*new)->rb_left) : &((*new)->rb_right); + } + + rb_link_node(&b->node, parent, new); + rb_insert_color(&b->node, &c->buffer_tree); +} + +static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) +{ + rb_erase(&b->node, &c->buffer_tree); +} + /*----------------------------------------------------------------*/ static void adjust_total_allocated(enum data_mode data_mode, long diff) @@ -434,7 +474,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) b->block = block; b->list_mode = dirty; list_add(&b->lru_list, &c->lru[dirty]); - hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]); + __insert(b->c, b); b->last_accessed = jiffies; } @@ -448,7 +488,7 @@ static void __unlink_buffer(struct dm_buffer *b) BUG_ON(!c->n_buffers[b->list_mode]); c->n_buffers[b->list_mode]--; - hlist_del(&b->hash_list); + __remove(b->c, b); list_del(&b->lru_list); } @@ -888,23 +928,6 @@ static void __check_watermark(struct dm_bufio_client *c, __write_dirty_buffers_async(c, 1, write_list); } -/* - * Find a buffer in the hash. 
- */
-static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
-{
-	struct dm_buffer *b;
-
-	hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
-			     hash_list) {
-		dm_bufio_cond_resched();
-		if (b->block == block)
-			return b;
-	}
-
-	return NULL;
-}
-
 /*----------------------------------------------------------------
  * Getting a buffer
  *--------------------------------------------------------------*/
@@ -1534,11 +1557,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 		r = -ENOMEM;
 		goto bad_client;
 	}
-	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
-	if (!c->cache_hash) {
-		r = -ENOMEM;
-		goto bad_hash;
-	}
+	c->buffer_tree = RB_ROOT;

 	c->bdev = bdev;
 	c->block_size = block_size;
@@ -1557,9 +1576,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 		c->n_buffers[i] = 0;
 	}

-	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
-		INIT_HLIST_HEAD(&c->cache_hash[i]);
-
 	mutex_init(&c->lock);
 	INIT_LIST_HEAD(&c->reserved_buffers);
 	c->need_reserved_buffers = reserved_buffers;
@@ -1633,8 +1649,6 @@ bad_cache:
 	}
 	dm_io_client_destroy(c->dm_io);
 bad_dm_io:
-	vfree(c->cache_hash);
-bad_hash:
 	kfree(c);
 bad_client:
 	return ERR_PTR(r);
@@ -1661,9 +1675,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)

 	mutex_unlock(&dm_bufio_clients_lock);

-	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
-		BUG_ON(!hlist_empty(&c->cache_hash[i]));
-
+	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
 	BUG_ON(c->need_reserved_buffers);

 	while (!list_empty(&c->reserved_buffers)) {
@@ -1681,7 +1693,6 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
 		BUG_ON(c->n_buffers[i]);

 	dm_io_client_destroy(c->dm_io);
-	vfree(c->cache_hash);
 	kfree(c);
 }
 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
-- cgit v1.2.3
From 33096a7822de63bc7dbdd090870b656a0304fa35 Mon Sep 17 00:00:00 2001
From: Joe Thornber
Date: Thu, 9 Oct 2014 11:10:25 +0100
Subject: dm bufio: evict buffers that are past the max age but retain some buffers

These changes help keep metadata backed by dm-bufio in-core longer, which
fixes reports of metadata churn in the face of heavy random IO workloads.

Before, bufio evicted all buffers older than DM_BUFIO_DEFAULT_AGE_SECS.
Having a device (e.g. dm-thinp or dm-cache) lose all metadata just because
associated buffers had been idle for some time is unfriendly.

Now the user may configure the number of bytes that bufio retains using
the 'retain_bytes' module parameter.  The default is 256K.

Also, the DM_BUFIO_WORK_TIMER_SECS and DM_BUFIO_DEFAULT_AGE_SECS defaults
were quite low, so increase them (to 30 and 300 respectively).

Signed-off-by: Joe Thornber
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-bufio.c | 109 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 75 insertions(+), 34 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index dcaa1d9dfbe4..99579c81ae0a 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -35,12 +35,17 @@
 /*
  * Check buffer ages in this interval (seconds)
  */
-#define DM_BUFIO_WORK_TIMER_SECS	10
+#define DM_BUFIO_WORK_TIMER_SECS	30

 /*
  * Free buffers when they are older than this (seconds)
  */
-#define DM_BUFIO_DEFAULT_AGE_SECS	60
+#define DM_BUFIO_DEFAULT_AGE_SECS	300
+
+/*
+ * The nr of bytes of cached data to keep around.
+ */
+#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)

 /*
  * The number of bvec entries that are embedded directly in the buffer.
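(Editorial aside, not part of the patch: the retain_bytes arithmetic this
commit introduces is easy to model. Below is a minimal userspace sketch of
the get_retain_buffers()/__evict_old_buffers() logic that appears later in
this diff; locking, the LRU walk and jiffies handling are elided.)

#include <stdbool.h>

struct client_model {
	unsigned block_size;	/* bytes per buffer */
	unsigned n_buffers;	/* buffers currently cached */
};

/* retain_bytes is a byte budget; express it as a buffer count. */
static unsigned retain_target(const struct client_model *c, unsigned retain_bytes)
{
	return retain_bytes / c->block_size;
}

/*
 * Age-based eviction stops once the client is down to the retain
 * target, so an idle device keeps a core working set of metadata
 * instead of losing every buffer.  With the 256K default and 4K
 * blocks, the target works out to 64 buffers.
 */
static bool may_evict(const struct client_model *c, unsigned retain_bytes,
		      bool past_max_age)
{
	return past_max_age && c->n_buffers > retain_target(c, retain_bytes);
}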
@@ -216,6 +221,7 @@ static DEFINE_SPINLOCK(param_spinlock); * Buffers are freed after this timeout */ static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; +static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; static unsigned long dm_bufio_peak_allocated; static unsigned long dm_bufio_allocated_kmem_cache; @@ -1457,45 +1463,52 @@ static void drop_buffers(struct dm_bufio_client *c) } /* - * Test if the buffer is unused and too old, and commit it. + * We may not be able to evict this buffer if IO pending or the client + * is still using it. Caller is expected to know buffer is too old. + * * And if GFP_NOFS is used, we must not do any I/O because we hold * dm_bufio_clients_lock and we would risk deadlock if the I/O gets * rerouted to different bufio client. */ -static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, - unsigned long max_jiffies) +static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) { - if (jiffies - b->last_accessed < max_jiffies) - return 0; - if (!(gfp & __GFP_FS)) { if (test_bit(B_READING, &b->state) || test_bit(B_WRITING, &b->state) || test_bit(B_DIRTY, &b->state)) - return 0; + return false; } if (b->hold_count) - return 0; + return false; __make_buffer_clean(b); __unlink_buffer(b); __free_buffer_wake(b); - return 1; + return true; } -static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, - gfp_t gfp_mask) +static unsigned get_retain_buffers(struct dm_bufio_client *c) +{ + unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + return retain_bytes / c->block_size; +} + +static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, + gfp_t gfp_mask) { int l; struct dm_buffer *b, *tmp; - long freed = 0; + unsigned long freed = 0; + unsigned long count = nr_to_scan; + unsigned retain_target = get_retain_buffers(c); for (l = 0; l < LIST_SIZE; l++) { list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { - freed += __cleanup_old_buffer(b, gfp_mask, 0); - if (!--nr_to_scan) + if (__try_evict_buffer(b, gfp_mask)) + freed++; + if (!--nr_to_scan || ((count - freed) <= retain_target)) return freed; dm_bufio_cond_resched(); } @@ -1697,31 +1710,56 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); -static void cleanup_old_buffers(void) +static unsigned get_max_age_hz(void) { - unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age); - struct dm_bufio_client *c; + unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); - if (max_age > ULONG_MAX / HZ) - max_age = ULONG_MAX / HZ; + if (max_age > UINT_MAX / HZ) + max_age = UINT_MAX / HZ; - mutex_lock(&dm_bufio_clients_lock); - list_for_each_entry(c, &dm_bufio_all_clients, client_list) { - if (!dm_bufio_trylock(c)) - continue; + return max_age * HZ; +} - while (!list_empty(&c->lru[LIST_CLEAN])) { - struct dm_buffer *b; - b = list_entry(c->lru[LIST_CLEAN].prev, - struct dm_buffer, lru_list); - if (!__cleanup_old_buffer(b, 0, max_age * HZ)) - break; - dm_bufio_cond_resched(); - } +static bool older_than(struct dm_buffer *b, unsigned long age_hz) +{ + return (jiffies - b->last_accessed) >= age_hz; +} + +static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) +{ + struct dm_buffer *b, *tmp; + unsigned retain_target = get_retain_buffers(c); + unsigned count; + + dm_bufio_lock(c); + + count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { + if (count <= retain_target) + break; + + if (!older_than(b, 
age_hz)) + break; + + if (__try_evict_buffer(b, 0)) + count--; - dm_bufio_unlock(c); dm_bufio_cond_resched(); } + + dm_bufio_unlock(c); +} + +static void cleanup_old_buffers(void) +{ + unsigned long max_age_hz = get_max_age_hz(); + struct dm_bufio_client *c; + + mutex_lock(&dm_bufio_clients_lock); + + list_for_each_entry(c, &dm_bufio_all_clients, client_list) + __evict_old_buffers(c, max_age_hz); + mutex_unlock(&dm_bufio_clients_lock); } @@ -1846,6 +1884,9 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); +module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); + module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); -- cgit v1.2.3 From a195db2d29a47c2c3a61386009bd400df18c86cf Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 6 Oct 2014 16:30:06 -0400 Subject: dm bio prison: switch to using a red black tree Previously it was using a fixed sized hash table. There are times when very many concurrent cells are held (such as when processing a very large discard). When this happens the hash table performance becomes very poor. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison.c | 172 ++++++++++++++++++------------------------- drivers/md/dm-bio-prison.h | 7 +- drivers/md/dm-cache-target.c | 3 +- drivers/md/dm-thin.c | 3 +- 4 files changed, 79 insertions(+), 106 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index f752d12081ff..90a56625245a 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -14,68 +14,38 @@ /*----------------------------------------------------------------*/ -struct bucket { - spinlock_t lock; - struct hlist_head cells; -}; +#define MIN_CELLS 1024 struct dm_bio_prison { + spinlock_t lock; mempool_t *cell_pool; - - unsigned nr_buckets; - unsigned hash_mask; - struct bucket *buckets; + struct rb_root cells; }; -/*----------------------------------------------------------------*/ - -static uint32_t calc_nr_buckets(unsigned nr_cells) -{ - uint32_t n = 128; - - nr_cells /= 4; - nr_cells = min(nr_cells, 8192u); - - while (n < nr_cells) - n <<= 1; - - return n; -} - static struct kmem_cache *_cell_cache; -static void init_bucket(struct bucket *b) -{ - spin_lock_init(&b->lock); - INIT_HLIST_HEAD(&b->cells); -} +/*----------------------------------------------------------------*/ /* * @nr_cells should be the number of cells you want in use _concurrently_. * Don't confuse it with the number of distinct keys. 
*/ -struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) +struct dm_bio_prison *dm_bio_prison_create(void) { - unsigned i; - uint32_t nr_buckets = calc_nr_buckets(nr_cells); - size_t len = sizeof(struct dm_bio_prison) + - (sizeof(struct bucket) * nr_buckets); - struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL); + struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL); if (!prison) return NULL; - prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); + spin_lock_init(&prison->lock); + + prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache); if (!prison->cell_pool) { kfree(prison); return NULL; } - prison->nr_buckets = nr_buckets; - prison->hash_mask = nr_buckets - 1; - prison->buckets = (struct bucket *) (prison + 1); - for (i = 0; i < nr_buckets; i++) - init_bucket(prison->buckets + i); + prison->cells = RB_ROOT; return prison; } @@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison, } EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); -static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key) +static void __setup_new_cell(struct dm_cell_key *key, + struct bio *holder, + struct dm_bio_prison_cell *cell) { - const unsigned long BIG_PRIME = 4294967291UL; - uint64_t hash = key->block * BIG_PRIME; - - return (uint32_t) (hash & prison->hash_mask); + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = holder; + bio_list_init(&cell->bios); } -static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) +static int cmp_keys(struct dm_cell_key *lhs, + struct dm_cell_key *rhs) { - return (lhs->virtual == rhs->virtual) && - (lhs->dev == rhs->dev) && - (lhs->block == rhs->block); -} + if (lhs->virtual < rhs->virtual) + return -1; -static struct bucket *get_bucket(struct dm_bio_prison *prison, - struct dm_cell_key *key) -{ - return prison->buckets + hash_key(prison, key); -} + if (lhs->virtual > rhs->virtual) + return 1; -static struct dm_bio_prison_cell *__search_bucket(struct bucket *b, - struct dm_cell_key *key) -{ - struct dm_bio_prison_cell *cell; + if (lhs->dev < rhs->dev) + return -1; - hlist_for_each_entry(cell, &b->cells, list) - if (keys_equal(&cell->key, key)) - return cell; + if (lhs->dev > rhs->dev) + return 1; - return NULL; -} + if (lhs->block < rhs->block) + return -1; -static void __setup_new_cell(struct bucket *b, - struct dm_cell_key *key, - struct bio *holder, - struct dm_bio_prison_cell *cell) -{ - memcpy(&cell->key, key, sizeof(cell->key)); - cell->holder = holder; - bio_list_init(&cell->bios); - hlist_add_head(&cell->list, &b->cells); + if (lhs->block > rhs->block) + return 1; + + return 0; } -static int __bio_detain(struct bucket *b, +static int __bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key, struct bio *inmate, struct dm_bio_prison_cell *cell_prealloc, struct dm_bio_prison_cell **cell_result) { - struct dm_bio_prison_cell *cell; - - cell = __search_bucket(b, key); - if (cell) { - if (inmate) - bio_list_add(&cell->bios, inmate); - *cell_result = cell; - return 1; + int r; + struct rb_node **new = &prison->cells.rb_node, *parent = NULL; + + while (*new) { + struct dm_bio_prison_cell *cell = + container_of(*new, struct dm_bio_prison_cell, node); + + r = cmp_keys(key, &cell->key); + + parent = *new; + if (r < 0) + new = &((*new)->rb_left); + else if (r > 0) + new = &((*new)->rb_right); + else { + if (inmate) + bio_list_add(&cell->bios, inmate); + *cell_result = cell; + return 1; + } } - __setup_new_cell(b, key, inmate, cell_prealloc); + 
__setup_new_cell(key, inmate, cell_prealloc); *cell_result = cell_prealloc; + + rb_link_node(&cell_prealloc->node, parent, new); + rb_insert_color(&cell_prealloc->node, &prison->cells); + return 0; } @@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison, { int r; unsigned long flags; - struct bucket *b = get_bucket(prison, key); - spin_lock_irqsave(&b->lock, flags); - r = __bio_detain(b, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); + spin_unlock_irqrestore(&prison->lock, flags); return r; } @@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell); /* * @inmates must have been initialised prior to this call */ -static void __cell_release(struct dm_bio_prison_cell *cell, +static void __cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - hlist_del(&cell->list); + rb_erase(&cell->node, &prison->cells); if (inmates) { if (cell->holder) @@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison, struct bio_list *bios) { unsigned long flags; - struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&b->lock, flags); - __cell_release(cell, bios); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + __cell_release(prison, cell, bios); + spin_unlock_irqrestore(&prison->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release); /* * Sometimes we don't want the holder, just the additional bios. */ -static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, +static void __cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - hlist_del(&cell->list); + rb_erase(&cell->node, &prison->cells); bio_list_merge(inmates, &cell->bios); } @@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct bio_list *inmates) { unsigned long flags; - struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&b->lock, flags); - __cell_release_no_holder(cell, inmates); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + __cell_release_no_holder(prison, cell, inmates); + spin_unlock_irqrestore(&prison->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index 6805a142b750..997a43960e77 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -10,8 +10,8 @@ #include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ #include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ -#include #include +#include /*----------------------------------------------------------------*/ @@ -35,13 +35,14 @@ struct dm_cell_key { * themselves. 
*/ struct dm_bio_prison_cell { - struct hlist_node list; + struct rb_node node; + struct dm_cell_key key; struct bio *holder; struct bio_list bios; }; -struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells); +struct dm_bio_prison *dm_bio_prison_create(void); void dm_bio_prison_destroy(struct dm_bio_prison *prison); /* diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 7130505c2425..69de8b43ca12 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -95,7 +95,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) /*----------------------------------------------------------------*/ -#define PRISON_CELLS 1024 #define MIGRATION_POOL_SIZE 128 #define COMMIT_PERIOD HZ #define MIGRATION_COUNT_WINDOW 10 @@ -2327,7 +2326,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) INIT_DELAYED_WORK(&cache->waker, do_waker); cache->last_commit_jiffies = jiffies; - cache->prison = dm_bio_prison_create(PRISON_CELLS); + cache->prison = dm_bio_prison_create(); if (!cache->prison) { *error = "could not create bio prison"; goto bad; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 0f86d802b533..eecfe7495232 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -25,7 +25,6 @@ */ #define ENDIO_HOOK_POOL_SIZE 1024 #define MAPPING_POOL_SIZE 1024 -#define PRISON_CELLS 1024 #define COMMIT_PERIOD HZ #define NO_SPACE_TIMEOUT_SECS 60 @@ -2193,7 +2192,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->sectors_per_block_shift = __ffs(block_size); pool->low_water_blocks = 0; pool_features_init(&pool->pf); - pool->prison = dm_bio_prison_create(PRISON_CELLS); + pool->prison = dm_bio_prison_create(); if (!pool->prison) { *error = "Error creating pool's bio prison"; err_p = ERR_PTR(-ENOMEM); -- cgit v1.2.3 From e5cfc69a513cdc9d9e753c5ce07f0cc6b496bfd3 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 6 Oct 2014 15:24:55 +0100 Subject: dm thin metadata: change dm_thin_find_block to allow blocking, but not issuing, IO This change is a prerequisite for allowing metadata to be prefetched. 
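(Editorial sketch before the sign-offs, condensed from how the thin target
consumes this flag later in the series; the names come from the surrounding
patches, nothing here is new API. With can_issue_io zero the lookup may
still block on the metadata root lock, but returns -EWOULDBLOCK instead of
reading from disk; the caller then defers the bio to the worker, which
retries with can_issue_io set.)

static int fast_path_lookup(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct dm_thin_lookup_result result;
	int r;

	/* can_issue_io == 0: may wait on locks, will not issue metadata reads */
	r = dm_thin_find_block(tc->td, block, 0, &result);
	switch (r) {
	case 0:
		/* mapping already in core: remap and submit immediately */
		remap_and_issue(tc, bio, result.block);
		return DM_MAPIO_REMAPPED;
	case -EWOULDBLOCK:
		/* metadata not cached: hand off to the worker (can_issue_io == 1) */
		thin_defer_bio(tc, bio);
		return DM_MAPIO_SUBMITTED;
	default:
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}
}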
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 30 +++++++++++++----------------- drivers/md/dm-thin-metadata.h | 4 ++-- 2 files changed, 15 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index e9d33ad59df5..ee42d1c52387 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1384,42 +1384,38 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) } int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, - int can_block, struct dm_thin_lookup_result *result) + int can_issue_io, struct dm_thin_lookup_result *result) { - int r = -EINVAL; - uint64_t block_time = 0; + int r; __le64 value; struct dm_pool_metadata *pmd = td->pmd; dm_block_t keys[2] = { td->id, block }; struct dm_btree_info *info; - if (can_block) { - down_read(&pmd->root_lock); - info = &pmd->info; - } else if (down_read_trylock(&pmd->root_lock)) - info = &pmd->nb_info; - else - return -EWOULDBLOCK; - if (pmd->fail_io) - goto out; + return -EINVAL; - r = dm_btree_lookup(info, pmd->root, keys, &value); - if (!r) - block_time = le64_to_cpu(value); + down_read(&pmd->root_lock); -out: - up_read(&pmd->root_lock); + if (can_issue_io) { + info = &pmd->info; + } else + info = &pmd->nb_info; + r = dm_btree_lookup(info, pmd->root, keys, &value); if (!r) { + uint64_t block_time = 0; dm_block_t exception_block; uint32_t exception_time; + + block_time = le64_to_cpu(value); unpack_block_time(block_time, &exception_block, &exception_time); result->block = exception_block; result->shared = __snapshotted_since(td, exception_time); } + up_read(&pmd->root_lock); return r; } diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index e3c857db195a..efedd5a4cd8f 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -139,12 +139,12 @@ struct dm_thin_lookup_result { /* * Returns: - * -EWOULDBLOCK iff @can_block is set and would block. + * -EWOULDBLOCK iff @can_issue_io is set and would issue IO * -ENODATA iff that mapping is not present. * 0 success */ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, - int can_block, struct dm_thin_lookup_result *result); + int can_issue_io, struct dm_thin_lookup_result *result); /* * Obtain an unused block. -- cgit v1.2.3 From 4646015d7e4ca5a4dc19427fb0a0aeff15a4fd91 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 6 Oct 2014 15:27:26 +0100 Subject: dm transaction manager: add support for prefetching blocks of metadata Introduce the dm_tm_issue_prefetches interface. If you're using a non-blocking clone the tm will build up a list of requested blocks that weren't in core. dm_tm_issue_prefetches will request those blocks to be prefetched. 
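(Editorial note on the data structure, as a simplified userspace model
rather than the kernel code: the prefetch set is deliberately lossy. It is
a fixed array of 128 block numbers indexed by a hash of the block number;
an occupied slot simply drops the new request, since a lost prefetch only
costs one synchronous read later. dm_tm_issue_prefetches() then sweeps the
array, issuing each remembered block and resetting its slot to the
sentinel.)

#include <stdint.h>

#define NSLOTS 128			/* mirrors PREFETCH_SIZE */
#define SENTINEL ((uint64_t)-1)		/* mirrors PREFETCH_SENTINEL */

static uint64_t slot[NSLOTS];		/* assume all initialised to SENTINEL */

static unsigned hash7(uint64_t b)
{
	/* top 7 bits of a multiplicative hash, in the spirit of hash_64(b, 7) */
	return (unsigned)((b * 0x9e3779b97f4a7c15ULL) >> 57);
}

static void model_prefetch_add(uint64_t b)
{
	unsigned h = hash7(b);

	/* first writer wins; a colliding block is silently forgotten */
	if (slot[h] == SENTINEL)
		slot[h] = b;
}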
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- .../md/persistent-data/dm-transaction-manager.c | 77 +++++++++++++++++++++- .../md/persistent-data/dm-transaction-manager.h | 7 ++ 2 files changed, 82 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 3bc30a0ae3d6..9cb797d800cf 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -10,6 +10,8 @@ #include "dm-persistent-data-internal.h" #include +#include +#include #include #include @@ -17,6 +19,61 @@ /*----------------------------------------------------------------*/ +#define PREFETCH_SIZE 128 +#define PREFETCH_BITS 7 +#define PREFETCH_SENTINEL ((dm_block_t) -1ULL) + +struct prefetch_set { + struct mutex lock; + dm_block_t blocks[PREFETCH_SIZE]; +}; + +static unsigned prefetch_hash(dm_block_t b) +{ + return hash_64(b, PREFETCH_BITS); +} + +static void prefetch_wipe(struct prefetch_set *p) +{ + unsigned i; + for (i = 0; i < PREFETCH_SIZE; i++) + p->blocks[i] = PREFETCH_SENTINEL; +} + +static void prefetch_init(struct prefetch_set *p) +{ + mutex_init(&p->lock); + prefetch_wipe(p); +} + +static void prefetch_add(struct prefetch_set *p, dm_block_t b) +{ + unsigned h = prefetch_hash(b); + + mutex_lock(&p->lock); + if (p->blocks[h] == PREFETCH_SENTINEL) + p->blocks[h] = b; + + mutex_unlock(&p->lock); +} + +static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm) +{ + unsigned i; + + mutex_lock(&p->lock); + + for (i = 0; i < PREFETCH_SIZE; i++) + if (p->blocks[i] != PREFETCH_SENTINEL) { + dm_bm_prefetch(bm, p->blocks[i]); + p->blocks[i] = PREFETCH_SENTINEL; + } + + mutex_unlock(&p->lock); +} + +/*----------------------------------------------------------------*/ + struct shadow_info { struct hlist_node hlist; dm_block_t where; @@ -37,6 +94,8 @@ struct dm_transaction_manager { spinlock_t lock; struct hlist_head buckets[DM_HASH_SIZE]; + + struct prefetch_set prefetches; }; /*----------------------------------------------------------------*/ @@ -117,6 +176,8 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, for (i = 0; i < DM_HASH_SIZE; i++) INIT_HLIST_HEAD(tm->buckets + i); + prefetch_init(&tm->prefetches); + return tm; } @@ -268,8 +329,14 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, struct dm_block_validator *v, struct dm_block **blk) { - if (tm->is_clone) - return dm_bm_read_try_lock(tm->real->bm, b, v, blk); + if (tm->is_clone) { + int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk); + + if (r == -EWOULDBLOCK) + prefetch_add(&tm->real->prefetches, b); + + return r; + } return dm_bm_read_lock(tm->bm, b, v, blk); } @@ -317,6 +384,12 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) return tm->bm; } +void dm_tm_issue_prefetches(struct dm_transaction_manager *tm) +{ + prefetch_issue(&tm->prefetches, tm->bm); +} +EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches); + /*----------------------------------------------------------------*/ static int dm_tm_create_internal(struct dm_block_manager *bm, diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index 2772ed2a781a..2e0d4d66fb1b 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -108,6 +108,13 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t 
b, struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); +/* + * If you're using a non-blocking clone the tm will build up a list of + * requested blocks that weren't in core. This call will request those + * blocks to be prefetched. + */ +void dm_tm_issue_prefetches(struct dm_transaction_manager *tm); + /* * A little utility that ties the knot by producing a transaction manager * that has a space map managed by the transaction manager... -- cgit v1.2.3 From 8a01a6af75f839ff8eb25dab69b49224e855bfa1 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 6 Oct 2014 15:28:30 +0100 Subject: dm thin: prefetch missing metadata pages Prefetch metadata at the start of the worker thread and then again every 128th bio processed from the deferred list. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 5 +++++ drivers/md/dm-thin-metadata.h | 5 +++++ drivers/md/dm-thin.c | 10 ++++++---- 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index ee42d1c52387..43adbb863f5a 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1809,3 +1809,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) return needs_check; } + +void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd) +{ + dm_tm_issue_prefetches(pmd->tm); +} diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index efedd5a4cd8f..921d15ee56a0 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -213,6 +213,11 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); +/* + * Issue any prefetches that may be useful. + */ +void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd); + /*----------------------------------------------------------------*/ #endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index eecfe7495232..97a7eb4d0412 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1526,6 +1526,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) struct bio *bio; struct bio_list bios; struct blk_plug plug; + unsigned count = 0; if (tc->requeue_mode) { requeue_bio_list(tc, &tc->deferred_bio_list); @@ -1567,6 +1568,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) pool->process_discard(tc, bio); else pool->process_bio(tc, bio); + + if ((count++ & 127) == 0) { + dm_pool_issue_prefetches(pool->pmd); + } } blk_finish_plug(&plug); } @@ -1652,6 +1657,7 @@ static void do_worker(struct work_struct *ws) { struct pool *pool = container_of(ws, struct pool, worker); + dm_pool_issue_prefetches(pool->pmd); process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); process_deferred_bios(pool); @@ -1996,10 +2002,6 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) /* fall through */ case -EWOULDBLOCK: - /* - * In future, the failed dm_thin_find_block above could - * provide the hint to load the metadata into cache. 
- */ thin_defer_bio(tc, bio); cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; -- cgit v1.2.3 From 7d327fe051edcccf54da7b6733c58992473f228b Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 6 Oct 2014 15:45:59 +0100 Subject: dm thin: throttle incoming IO Throttle IO based on the time it's taking the worker to do one loop. There were reports of hung task timeouts occuring and it was observed that the excessively long avgqu-sz (as reported by iostat) was contributing to these hung tasks. Throttling definitely helps dm-thinp perform better under heavy IO load (without being detremental by being overzealous). It reduces avgqu-sz drastically, e.g.: from 60K to ~6K, and even as low as 150 once metadata is cached by bufio, when dirty_ratio=5, dirty_background_ratio=2. And avgqu-sz stays at or below 30K even with dirty_ratio=20, dirty_background_ratio=10. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 97a7eb4d0412..91b430b883fd 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -126,6 +126,53 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, /*----------------------------------------------------------------*/ +#define THROTTLE_THRESHOLD (1 * HZ) + +struct throttle { + struct rw_semaphore lock; + unsigned long threshold; + bool throttle_applied; +}; + +static void throttle_init(struct throttle *t) +{ + init_rwsem(&t->lock); + t->throttle_applied = false; +} + +static void throttle_work_start(struct throttle *t) +{ + t->threshold = jiffies + THROTTLE_THRESHOLD; +} + +static void throttle_work_update(struct throttle *t) +{ + if (!t->throttle_applied && jiffies > t->threshold) { + down_write(&t->lock); + t->throttle_applied = true; + } +} + +static void throttle_work_complete(struct throttle *t) +{ + if (t->throttle_applied) { + t->throttle_applied = false; + up_write(&t->lock); + } +} + +static void throttle_lock(struct throttle *t) +{ + down_read(&t->lock); +} + +static void throttle_unlock(struct throttle *t) +{ + up_read(&t->lock); +} + +/*----------------------------------------------------------------*/ + /* * A pool device ties together a metadata device and a data device. 
It * also provides the interface for creating and destroying internal @@ -175,6 +222,7 @@ struct pool { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; + struct throttle throttle; struct work_struct worker; struct delayed_work waker; struct delayed_work no_space_timeout; @@ -1570,6 +1618,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) pool->process_bio(tc, bio); if ((count++ & 127) == 0) { + throttle_work_update(&pool->throttle); dm_pool_issue_prefetches(pool->pmd); } } @@ -1657,10 +1706,15 @@ static void do_worker(struct work_struct *ws) { struct pool *pool = container_of(ws, struct pool, worker); + throttle_work_start(&pool->throttle); dm_pool_issue_prefetches(pool->pmd); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); + throttle_work_update(&pool->throttle); process_deferred_bios(pool); + throttle_work_complete(&pool->throttle); } /* @@ -1900,6 +1954,15 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) wake_worker(pool); } +static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + thin_defer_bio(tc, bio); + throttle_unlock(&pool->throttle); +} + static void thin_hook_bio(struct thin_c *tc, struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@ -1937,7 +2000,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) } if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { - thin_defer_bio(tc, bio); + thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; } @@ -2220,6 +2283,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, goto bad_wq; } + throttle_init(&pool->throttle); INIT_WORK(&pool->worker, do_worker); INIT_DELAYED_WORK(&pool->waker, do_waker); INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); -- cgit v1.2.3 From 604ea90641b45f41f8dee34ce45694f1e0c53a5a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 9 Oct 2014 18:43:25 -0400 Subject: dm thin: adjust max_sectors_kb based on thinp blocksize Allows for filesystems to submit bios that are a factor of the thinp blocksize, improving dm-thinp efficiency (particularly when the data volume is RAID). Also set io_min to max_sectors_kb if it is a factor of the thinp blocksize. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 91b430b883fd..de55ae9d4926 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -3242,15 +3243,42 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * Adjust max_sectors_kb to highest possible power-of-2 + * factor of pool->sectors_per_block. 
+ */ + if (limits->max_hw_sectors & (limits->max_hw_sectors - 1)) + limits->max_sectors = rounddown_pow_of_two(limits->max_hw_sectors); + else + limits->max_sectors = limits->max_hw_sectors; + + if (limits->max_sectors < pool->sectors_per_block) { + while (!is_factor(pool->sectors_per_block, limits->max_sectors)) { + if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) + limits->max_sectors--; + limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); + } + } else if (block_size_is_power_of_two(pool)) { + /* max_sectors_kb is >= power-of-2 thinp blocksize */ + while (!is_factor(limits->max_sectors, pool->sectors_per_block)) { + if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) + limits->max_sectors--; + limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); + } + } /* * If the system-determined stacked limits are compatible with the * pool's blocksize (io_opt is a factor) do not override them. */ if (io_opt_sectors < pool->sectors_per_block || - do_div(io_opt_sectors, pool->sectors_per_block)) { - blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); + !is_factor(io_opt_sectors, pool->sectors_per_block)) { + if (is_factor(pool->sectors_per_block, limits->max_sectors)) + blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT); + else + blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); } -- cgit v1.2.3 From 148e51baf8e7ae2070ec47c2a0ec05ddf6a47da1 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 9 Oct 2014 19:32:22 -0400 Subject: dm: improve documentation and code clarity in dm_merge_bvec These code changes do not introduce a functional change. But bio_add_page() will never attempt to build up a bio larger than queue_max_sectors(). Similarly, bio_get_nr_vecs() is also bound by queue_max_sectors(). Therefore, there is no point in allowing dm_merge_bvec() to answer "how many sectors can a bio have at this offset?" with anything larger than queue_max_sectors(). Using queue_max_sectors() rather than BIO_MAX_SECTORS serves to more accurately convey the limits that are being imposed. Also, use unlikely() to clarify the fact that the defensive code in dm_merge_bvec() relative to max_size going negative shouldn't ever happen -- if it does happen there is a bug in the block layer for requesting larger than dm_merge_bvec()'s initial response for a given offset. Also, update a comment in dm_merge_bvec() relative to max_hw_sectors_kb. And fix empty newline whitespace. 
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 58f3927fd7cc..0fee0e54d36f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1607,9 +1607,9 @@ static int dm_merge_bvec(struct request_queue *q,
 	 * Find maximum amount of I/O that won't need splitting
 	 */
 	max_sectors = min(max_io_len(bvm->bi_sector, ti),
-			  (sector_t) BIO_MAX_SECTORS);
+			  (sector_t) queue_max_sectors(q));
 	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
-	if (max_size < 0)
+	if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
 		max_size = 0;

 	/*
@@ -1621,10 +1621,10 @@ static int dm_merge_bvec(struct request_queue *q,
 	 * If the target doesn't support merge method and some of the devices
-	 * provided their merge_bvec method (we know this by looking at
-	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
-	 * entries. So always set max_size to 0, and the code below allows
-	 * just one page.
+	 * provided their merge_bvec method (we know this by looking for the
+	 * max_hw_sectors that dm_set_device_limits may set), then we can't
+	 * allow bios with multiple vector entries. So always set max_size
+	 * to 0, and the code below allows just one page.
 	 */
 	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
 		max_size = 0;
-- cgit v1.2.3
From 36f12aeb714fc04752997d6c07b6afb2fa0ac947 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Thu, 9 Oct 2014 15:24:12 -0400
Subject: dm thin: implement thin_merge

Introduce thin_merge so that any additional constraints from the data
volume may be taken into account when determining the maximum number of
sectors that can be issued relative to the specified logical offset.

This is particularly important if/when the data volume is layered on top
of a more sophisticated device (e.g. dm-raid or some other DM target).
Reviewed-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index de55ae9d4926..068607828691 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3307,7 +3307,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3634,6 +3634,21 @@ err: DMEMIT("Error"); } +static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct thin_c *tc = ti->private; + struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = tc->pool_dev->bdev; + bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static int thin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -3658,7 +3673,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, @@ -3668,6 +3683,7 @@ static struct target_type thin_target = { .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, + .merge = thin_merge, .iterate_devices = thin_iterate_devices, }; -- cgit v1.2.3 From 7a7e97ca580b944d2c89b59bc74a7b9ddd044705 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 12 Sep 2014 11:34:01 +0100 Subject: dm thin: performance improvement to discard processing When processing a discard bio, if the block is already quiesced do the discard immediately rather than adding the mapping to a list for the next iteration of the worker thread. Discarding a fully provisioned 100G thin volume with 64k block size goes from 860s to 95s with this change. Clearly there's something wrong with the worker architecture, more investigation needed. 
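(Editorial gloss on the mechanism, restating the two lines at the heart of
this patch: as I read the deferred-set code in dm-bio-prison.c,
dm_deferred_set_add_work() queues the work and returns nonzero only while
IO that was in flight before this discard still covers the block; a zero
return means the block is already quiesced, so the prepared discard runs
synchronously instead of being parked on pool->prepared_discards until the
worker's next loop.)

	/* returns 0 when nothing older is in flight: safe to run right now */
	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
		pool->process_prepared_discard(m);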
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 068607828691..8c3d048dd319 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1194,7 +1194,6 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c static void process_discard(struct thin_c *tc, struct bio *bio) { int r; - unsigned long flags; struct pool *pool = tc->pool; struct dm_bio_prison_cell *cell, *cell2; struct dm_cell_key key, key2; @@ -1235,12 +1234,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio) m->cell2 = cell2; m->bio = bio; - if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { - spin_lock_irqsave(&pool->lock, flags); - list_add_tail(&m->list, &pool->prepared_discards); - spin_unlock_irqrestore(&pool->lock, flags); - wake_worker(pool); - } + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); + } else { inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); -- cgit v1.2.3 From 452d7a620dc38cb525c403aa4b445028da359268 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 9 Oct 2014 19:20:21 -0400 Subject: dm thin: factor out remap_and_issue_overwrite Purely cleanup of duplicated code, no functional change. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 8c3d048dd319..52562710f6a0 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -890,6 +890,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, } } +static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, + dm_block_t data_block, + struct dm_thin_new_mapping *m) +{ + struct pool *pool = tc->pool; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, data_block); +} + /* * A partial copy also needs to zero the uncopied region. */ @@ -924,15 +938,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, * If the whole block of data is being overwritten, we can issue the * bio immediately. Otherwise we use kcopyd to clone the data first. 
*/ - if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); - - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_dest); - } else { + if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_dest, m); + else { struct dm_io_region from, to; from.bdev = origin->bdev; @@ -1001,16 +1009,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, if (!pool->pf.zero_new_blocks) process_prepared_mapping(m); - else if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); - - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_block); + else if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_block, m); - } else + else ll_zero(tc, m, data_block * pool->sectors_per_block, (data_block + 1) * pool->sectors_per_block); -- cgit v1.2.3 From a374bb217b449a00eb96d0584bb833a8b62b672a Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 10 Oct 2014 13:43:14 +0100 Subject: dm thin: defer whole cells rather than individual bios This avoids dropping the cell, so increases the probability that other bios will collect within the cell, rather than being passed individually to the worker. Also add required process_cell and process_discard_cell error handling wrappers and set associated pool-mode function pointers accordingly. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison.h | 1 + drivers/md/dm-thin.c | 254 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 208 insertions(+), 47 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index 997a43960e77..c0cddb118582 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -35,6 +35,7 @@ struct dm_cell_key { * themselves. 
*/ struct dm_bio_prison_cell { + struct list_head user_list; /* for client use */ struct rb_node node; struct dm_cell_key key; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 52562710f6a0..912d7f4d89d1 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -202,6 +202,7 @@ struct pool_features { struct thin_c; typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); +typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell); typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); struct pool { @@ -246,6 +247,9 @@ struct pool { process_bio_fn process_bio; process_bio_fn process_discard; + process_cell_fn process_cell; + process_cell_fn process_discard_cell; + process_mapping_fn process_prepared_mapping; process_mapping_fn process_prepared_discard; }; @@ -282,6 +286,7 @@ struct thin_c { struct dm_thin_device *td; bool requeue_mode:1; spinlock_t lock; + struct list_head deferred_cells; struct bio_list deferred_bio_list; struct bio_list retry_on_resume_list; struct rb_root sort_bio_list; /* sorted list of deferred bios */ @@ -346,19 +351,6 @@ static void cell_release_no_holder(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } -static void cell_defer_no_holder_no_free(struct thin_c *tc, - struct dm_bio_prison_cell *cell) -{ - struct pool *pool = tc->pool; - unsigned long flags; - - spin_lock_irqsave(&tc->lock, flags); - dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); - - wake_worker(pool); -} - static void cell_error_with_code(struct pool *pool, struct dm_bio_prison_cell *cell, int error_code) { @@ -371,6 +363,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) cell_error_with_code(pool, cell, -EIO); } +static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, 0); +} + +static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE); +} + /*----------------------------------------------------------------*/ /* @@ -458,10 +460,28 @@ static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) bio_endio(bio, DM_ENDIO_REQUEUE); } +static void requeue_deferred_cells(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell, *tmp; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + list_for_each_entry_safe(cell, tmp, &cells, user_list) + cell_requeue(pool, cell); +} + static void requeue_io(struct thin_c *tc) { requeue_bio_list(tc, &tc->deferred_bio_list); requeue_bio_list(tc, &tc->retry_on_resume_list); + requeue_deferred_cells(tc); } static void error_thin_retry_list(struct thin_c *tc) @@ -706,6 +726,28 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c wake_worker(pool); } +static void thin_defer_bio(struct thin_c *tc, struct bio *bio); + +static void inc_remap_and_issue_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + cell_release_no_holder(tc->pool, cell, &bios); + + while ((bio = bio_list_pop(&bios))) { + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) + thin_defer_bio(tc, bio); + else { + inc_all_io_entry(tc->pool, bio); + remap_and_issue(tc, bio, 
block); + } + } +} + static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { if (m->bio) { @@ -1193,19 +1235,21 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c retry_on_resume(bio); } -static void process_discard(struct thin_c *tc, struct bio *bio) +static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; + struct bio *bio = cell->holder; struct pool *pool = tc->pool; - struct dm_bio_prison_cell *cell, *cell2; - struct dm_cell_key key, key2; + struct dm_bio_prison_cell *cell2; + struct dm_cell_key key2; dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result; struct dm_thin_new_mapping *m; - build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { @@ -1273,6 +1317,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio) } } +static void process_discard_bio(struct thin_c *tc, struct bio *bio) +{ + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + dm_block_t block = get_bio_block(tc, bio); + + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &cell)) + return; + + process_discard_cell(tc, cell); +} + static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_cell_key *key, struct dm_thin_lookup_result *lookup_result, @@ -1379,34 +1436,30 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block } } -static void process_bio(struct thin_c *tc, struct bio *bio) +static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; struct pool *pool = tc->pool; + struct bio *bio = cell->holder; dm_block_t block = get_bio_block(tc, bio); - struct dm_bio_prison_cell *cell; - struct dm_cell_key key; struct dm_thin_lookup_result lookup_result; - /* - * If cell is already occupied, then the block is already - * being provisioned so we have nothing further to do here. - */ - build_virtual_key(tc->td, block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: if (lookup_result.shared) { process_shared_bio(tc, bio, block, &lookup_result); + // FIXME: we can't remap because we're waiting on a commit cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ } else { inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result.block); + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; @@ -1440,7 +1493,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio) } } -static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +static void process_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + dm_block_t block = get_bio_block(tc, bio); + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + + /* + * If cell is already occupied, then the block is already + * being provisioned so we have nothing further to do here. 
+ */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(pool, &key, bio, &cell)) + return; + + process_cell(tc, cell); +} + +static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, + struct dm_bio_prison_cell *cell) { int r; int rw = bio_data_dir(bio); @@ -1450,15 +1522,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) { handle_unserviceable_bio(tc->pool, bio); - else { + if (cell) + cell_defer_no_holder(tc, cell); + } else { inc_all_io_entry(tc->pool, bio); remap_and_issue(tc, bio, lookup_result.block); + if (cell) + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; case -ENODATA: + if (cell) + cell_defer_no_holder(tc, cell); if (rw != READ) { handle_unserviceable_bio(tc->pool, bio); break; @@ -1477,11 +1555,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) default: DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", __func__, r); + if (cell) + cell_defer_no_holder(tc, cell); bio_io_error(bio); break; } } +static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +{ + __process_bio_read_only(tc, bio, NULL); +} + +static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + __process_bio_read_only(tc, cell->holder, cell); +} + static void process_bio_success(struct thin_c *tc, struct bio *bio) { bio_endio(bio, 0); @@ -1492,6 +1582,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio) bio_io_error(bio); } +static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_success(tc->pool, cell); +} + +static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_error(tc->pool, cell); +} + /* * FIXME: should we also commit due to size of transaction, measured in * metadata blocks? @@ -1624,6 +1724,45 @@ static void process_thin_deferred_bios(struct thin_c *tc) blk_finish_plug(&plug); } +static void process_thin_deferred_cells(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell, *tmp; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + if (list_empty(&cells)) + return; + + list_for_each_entry_safe(cell, tmp, &cells, user_list) { + BUG_ON(!cell->holder); + + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. 
+ */ + if (ensure_next_mapping(pool)) { + spin_lock_irqsave(&tc->lock, flags); + list_add(&cell->user_list, &tc->deferred_cells); + list_splice(&cells, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + break; + } + + if (cell->holder->bi_rw & REQ_DISCARD) + pool->process_discard_cell(tc, cell); + else + pool->process_cell(tc, cell); + } +} + static void thin_get(struct thin_c *tc); static void thin_put(struct thin_c *tc); @@ -1672,6 +1811,7 @@ static void process_deferred_bios(struct pool *pool) tc = get_first_thin(pool); while (tc) { + process_thin_deferred_cells(tc); process_thin_deferred_bios(tc); tc = get_next_thin(pool, tc); } @@ -1850,6 +1990,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; + pool->process_cell = process_cell_fail; + pool->process_discard_cell = process_cell_fail; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_fail; @@ -1862,6 +2004,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_read_only; pool->process_discard = process_bio_success; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_cell_success; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1880,7 +2024,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) if (old_mode != new_mode) notify_of_pool_mode_change(pool, "out-of-data-space"); pool->process_bio = process_bio_read_only; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1893,7 +2039,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) notify_of_pool_mode_change(pool, "write"); dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard; break; @@ -1962,6 +2110,20 @@ static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) throttle_unlock(&pool->throttle); } +static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + unsigned long flags; + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + spin_lock_irqsave(&tc->lock, flags); + list_add_tail(&cell->user_list, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + throttle_unlock(&pool->throttle); + + wake_worker(pool); +} + static void thin_hook_bio(struct thin_c *tc, struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@ -1982,8 +2144,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) dm_block_t block = get_bio_block(tc, bio); struct dm_thin_device *td = tc->td; struct dm_thin_lookup_result result; - struct dm_bio_prison_cell cell1, cell2; - struct dm_bio_prison_cell *cell_result; + struct 
dm_bio_prison_cell *virt_cell, *data_cell; struct dm_cell_key key; thin_hook_bio(tc, bio); @@ -2008,7 +2169,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * there's a race with discard. */ build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) + if (bio_detain(tc->pool, &key, bio, &virt_cell)) return DM_MAPIO_SUBMITTED; r = dm_thin_find_block(td, block, 0, &result); @@ -2033,20 +2194,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * More distant ancestors are irrelevant. The * shared flag will be set in their case. */ - thin_defer_bio(tc, bio); - cell_defer_no_holder_no_free(tc, &cell1); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; } build_data_key(tc->td, result.block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { - cell_defer_no_holder_no_free(tc, &cell1); + if (bio_detain(tc->pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } inc_all_io_entry(tc->pool, bio); - cell_defer_no_holder_no_free(tc, &cell2); - cell_defer_no_holder_no_free(tc, &cell1); + cell_defer_no_holder(tc, data_cell); + cell_defer_no_holder(tc, virt_cell); remap(tc, bio, result.block); return DM_MAPIO_REMAPPED; @@ -2058,14 +2218,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * of doing so. */ handle_unserviceable_bio(tc->pool, bio); - cell_defer_no_holder_no_free(tc, &cell1); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } /* fall through */ case -EWOULDBLOCK: - thin_defer_bio(tc, bio); - cell_defer_no_holder_no_free(tc, &cell1); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; default: @@ -2075,7 +2234,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * pool is switched to fail-io mode. */ bio_io_error(bio); - cell_defer_no_holder_no_free(tc, &cell1); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } } @@ -3394,6 +3553,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out_unlock; } spin_lock_init(&tc->lock); + INIT_LIST_HEAD(&tc->deferred_cells); bio_list_init(&tc->deferred_bio_list); bio_list_init(&tc->retry_on_resume_list); tc->sort_bio_list = RB_ROOT; -- cgit v1.2.3 From 2d759a46b4d65e1392843cf9df7101897af87008 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 10 Oct 2014 15:27:16 +0100 Subject: dm thin: remap the bios in a cell immediately This use of direct submission in process_prepared_mapping() reduces latency for submitting bios in a cell by avoiding adding those bios to the deferred list and waiting for the next iteration of the worker. But this direct submission exposes the potential for a race between releasing a cell and incrementing deferred set. Fix this by introducing dm_cell_visit_release() and refactoring inc_remap_and_issue_cell() accordingly. 
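(Illustrative sketch, not part of the patch: the shape of the visit-then-release pattern that closes the race. The names below are cut-down stand-ins for the real prison types.)

#include <linux/bio.h>
#include <linux/spinlock.h>

/* Sketch only: simplified stand-ins for dm_bio_prison{_cell}. */
struct sketch_cell {
	struct bio_list bios;
};

struct sketch_prison {
	spinlock_t lock;
};

static void sketch_visit_release(struct sketch_prison *prison,
				 struct sketch_cell *cell,
				 void (*visit_fn)(void *, struct sketch_cell *),
				 void *context)
{
	unsigned long flags;

	spin_lock_irqsave(&prison->lock, flags);
	/*
	 * The visitor runs under the prison lock, so it can take any
	 * deferred-set references before a new bio can join the cell;
	 * the real dm_cell_visit_release() also erases the cell from
	 * the prison's rb-tree at this point.
	 */
	visit_fn(context, cell);
	spin_unlock_irqrestore(&prison->lock, flags);

	/* The gathered bios are remapped and submitted only after this. */
}

In the patch itself __inc_remap_and_issue_cell() plays the visitor role: under the lock it sorts each bio into a defer list or an issue list, and inc_remap_and_issue_cell() submits the issue list once the lock has been dropped.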
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison.c | 14 ++++++++ drivers/md/dm-bio-prison.h | 8 +++++ drivers/md/dm-thin.c | 90 +++++++++++++++++++++++++++++++--------------- 3 files changed, 83 insertions(+), 29 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index 90a56625245a..bbe22a5dc06b 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -241,6 +241,20 @@ void dm_cell_error(struct dm_bio_prison *prison, } EXPORT_SYMBOL_GPL(dm_cell_error); +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + visit_fn(context, cell); + rb_erase(&cell->node, &prison->cells); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_visit_release); + /*----------------------------------------------------------------*/ #define DEFERRED_SET_SIZE 64 diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index c0cddb118582..b03988667740 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -89,6 +89,14 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, void dm_cell_error(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, int error); +/* + * Visits the cell and then releases. Guarantees no new inmates are + * inserted between the visit and release. + */ +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, struct dm_bio_prison_cell *cell); + /*----------------------------------------------------------------*/ /* diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 912d7f4d89d1..5036d4b3f368 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -343,6 +343,15 @@ static void cell_release(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } +static void cell_visit_release(struct pool *pool, + void (*fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + dm_cell_visit_release(pool->prison, fn, context, cell); + dm_bio_prison_free_cell(pool->prison, cell); +} + static void cell_release_no_holder(struct pool *pool, struct dm_bio_prison_cell *cell, struct bio_list *bios) @@ -697,55 +706,75 @@ static void overwrite_endio(struct bio *bio, int err) */ /* - * This sends the bios in the cell back to the deferred_bios list. + * This sends the bios in the cell, except the original holder, back + * to the deferred_bios list. */ -static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) { struct pool *pool = tc->pool; unsigned long flags; spin_lock_irqsave(&tc->lock, flags); - cell_release(pool, cell, &tc->deferred_bio_list); + cell_release_no_holder(pool, cell, &tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); wake_worker(pool); } -/* - * Same as cell_defer above, except it omits the original holder of the cell. 
- */ -static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void thin_defer_bio(struct thin_c *tc, struct bio *bio); + +struct remap_info { + struct thin_c *tc; + struct bio_list defer_bios; + struct bio_list issue_bios; +}; + +static void __inc_remap_and_issue_cell(void *context, + struct dm_bio_prison_cell *cell) { - struct pool *pool = tc->pool; - unsigned long flags; + struct remap_info *info = context; + struct bio *bio; - spin_lock_irqsave(&tc->lock, flags); - cell_release_no_holder(pool, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + while ((bio = bio_list_pop(&cell->bios))) { + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) + bio_list_add(&info->defer_bios, bio); + else { + inc_all_io_entry(info->tc->pool, bio); - wake_worker(pool); + /* + * We can't issue the bios with the bio prison lock + * held, so we add them to a list to issue on + * return from this function. + */ + bio_list_add(&info->issue_bios, bio); + } + } } -static void thin_defer_bio(struct thin_c *tc, struct bio *bio); - static void inc_remap_and_issue_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell, dm_block_t block) { struct bio *bio; - struct bio_list bios; + struct remap_info info; - bio_list_init(&bios); - cell_release_no_holder(tc->pool, cell, &bios); + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); - while ((bio = bio_list_pop(&bios))) { - if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) - thin_defer_bio(tc, bio); - else { - inc_all_io_entry(tc->pool, bio); - remap_and_issue(tc, bio, block); - } - } + /* + * We have to be careful to inc any bios we're about to issue + * before the cell is released, and avoid a race with new bios + * being added to the cell. + */ + cell_visit_release(tc->pool, __inc_remap_and_issue_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(info.tc, bio, block); } static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) @@ -796,10 +825,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) * the bios in the cell. */ if (bio) { - cell_defer_no_holder(tc, m->cell); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); bio_endio(bio, 0); - } else - cell_defer(tc, m->cell); + } else { + inc_all_io_entry(tc->pool, m->cell->holder); + remap_and_issue(tc, m->cell->holder, m->data_block); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); + } out: list_del(&m->list); -- cgit v1.2.3 From 23ca2bb6c6104db9d4cff4e33cbabee303c49d4d Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 15 Oct 2014 14:46:58 +0100 Subject: dm thin: direct dispatch when breaking sharing This use of direct submission in process_shared_bio() reduces latency for submitting bios in the shared cell by avoiding adding those bios to the deferred list and waiting for the next iteration of the worker. 
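(Illustrative sketch, not part of the patch.) The rule for which bios may bypass the worker is deliberately narrow; restated from __remap_and_issue_shared_cell() below, using the 3.18-era bio flags:

#include <linux/bio.h>
#include <linux/blk_types.h>

/*
 * Only reads without flush/FUA/discard semantics can be remapped to
 * the shared data block straight away; everything else goes back to
 * the worker, which may need to break sharing first.
 */
static bool must_defer_to_worker(struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) ||
	       (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA));
}

Reads that pass this test still take a shared_read_entry reference before being issued, exactly as they would on the deferred path.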
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 70 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 5036d4b3f368..3f3a66124d46 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1390,11 +1390,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, } } +static void __remap_and_issue_shared_cell(void *context, + struct dm_bio_prison_cell *cell) +{ + struct remap_info *info = context; + struct bio *bio; + + while ((bio = bio_list_pop(&cell->bios))) { + if ((bio_data_dir(bio) == WRITE) || + (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))) + bio_list_add(&info->defer_bios, bio); + else { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; + + h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); + inc_all_io_entry(info->tc->pool, bio); + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void remap_and_issue_shared_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + cell_visit_release(tc->pool, __remap_and_issue_shared_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(tc, bio, block); +} + static void process_shared_bio(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct dm_thin_lookup_result *lookup_result) + struct dm_thin_lookup_result *lookup_result, + struct dm_bio_prison_cell *virt_cell) { - struct dm_bio_prison_cell *cell; + struct dm_bio_prison_cell *data_cell; struct pool *pool = tc->pool; struct dm_cell_key key; @@ -1403,19 +1445,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, * of being broken so we have nothing further to do here. */ build_data_key(tc->td, lookup_result->block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (bio_detain(pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return; + } - if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) - break_sharing(tc, bio, block, &key, lookup_result, cell); - else { + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) { + break_sharing(tc, bio, block, &key, lookup_result, data_cell); + cell_defer_no_holder(tc, virt_cell); + } else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result->block); + + remap_and_issue_shared_cell(tc, data_cell, lookup_result->block); + remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block); } } @@ -1484,11 +1530,9 @@ static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared) { - process_shared_bio(tc, bio, block, &lookup_result); - // FIXME: we can't remap because we're waiting on a commit - cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? 
*/ - } else { + if (lookup_result.shared) + process_shared_bio(tc, bio, block, &lookup_result, cell); + else { inc_all_io_entry(pool, bio); remap_and_issue(tc, bio, lookup_result.block); inc_remap_and_issue_cell(tc, cell, lookup_result.block); -- cgit v1.2.3 From ac4c3f34a9af63092b3fbfafe34c3e966fbd96c5 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 10 Oct 2014 16:42:10 +0100 Subject: dm thin: sort the deferred cells Sort the cells in logical block order before processing each cell in process_thin_deferred_cells(). This significantly improves the on-disk layout on rotational storage, thereby improving read performance. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 88 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 3f3a66124d46..b9d25026ab84 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #define DM_MSG_PREFIX "thin" @@ -205,6 +206,8 @@ typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell); typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); +#define CELL_SORT_ARRAY_SIZE 8192 + struct pool { struct list_head list; struct dm_target *ti; /* Only set if a pool target is bound */ @@ -252,6 +255,8 @@ struct pool { process_mapping_fn process_prepared_mapping; process_mapping_fn process_prepared_discard; + + struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE]; }; static enum pool_mode get_pool_mode(struct pool *pool); @@ -1800,12 +1805,48 @@ static void process_thin_deferred_bios(struct thin_c *tc) blk_finish_plug(&plug); } +static int cmp_cells(const void *lhs, const void *rhs) +{ + struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs); + struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs); + + BUG_ON(!lhs_cell->holder); + BUG_ON(!rhs_cell->holder); + + if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector) + return -1; + + if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector) + return 1; + + return 0; +} + +static unsigned sort_cells(struct pool *pool, struct list_head *cells) +{ + unsigned count = 0; + struct dm_bio_prison_cell *cell, *tmp; + + list_for_each_entry_safe(cell, tmp, cells, user_list) { + if (count >= CELL_SORT_ARRAY_SIZE) + break; + + pool->cell_sort_array[count++] = cell; + list_del(&cell->user_list); + } + + sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL); + + return count; +} + static void process_thin_deferred_cells(struct thin_c *tc) { struct pool *pool = tc->pool; unsigned long flags; struct list_head cells; - struct dm_bio_prison_cell *cell, *tmp; + struct dm_bio_prison_cell *cell; + unsigned i, j, count; INIT_LIST_HEAD(&cells); @@ -1816,27 +1857,34 @@ static void process_thin_deferred_cells(struct thin_c *tc) if (list_empty(&cells)) return; - list_for_each_entry_safe(cell, tmp, &cells, user_list) { - BUG_ON(!cell->holder); + do { + count = sort_cells(tc->pool, &cells); - /* - * If we've got no free new_mapping structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. 
- */ - if (ensure_next_mapping(pool)) { - spin_lock_irqsave(&tc->lock, flags); - list_add(&cell->user_list, &tc->deferred_cells); - list_splice(&cells, &tc->deferred_cells); - spin_unlock_irqrestore(&tc->lock, flags); - break; - } + for (i = 0; i < count; i++) { + cell = pool->cell_sort_array[i]; + BUG_ON(!cell->holder); - if (cell->holder->bi_rw & REQ_DISCARD) - pool->process_discard_cell(tc, cell); - else - pool->process_cell(tc, cell); - } + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. + */ + if (ensure_next_mapping(pool)) { + for (j = i; j < count; j++) + list_add(&pool->cell_sort_array[j]->user_list, &cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice(&cells, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + return; + } + + if (cell->holder->bi_rw & REQ_DISCARD) + pool->process_discard_cell(tc, cell); + else + pool->process_cell(tc, cell); + } + } while (!list_empty(&cells)); } static void thin_get(struct thin_c *tc); -- cgit v1.2.3 From 9d094eebd7fd3d3432a974f46490c32cae35edfe Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sun, 19 Oct 2014 08:23:09 -0400 Subject: dm thin: optimize retry_bios_on_resume Eliminate redundant should_error_unserviceable_bio check and error loop. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index b9d25026ab84..575e3ed723cc 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1263,13 +1263,8 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c bio_list_init(&bios); cell_release(pool, cell, &bios); - error = should_error_unserviceable_bio(pool); - if (error) - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, error); - else - while ((bio = bio_list_pop(&bios))) - retry_on_resume(bio); + while ((bio = bio_list_pop(&bios))) + retry_on_resume(bio); } static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) -- cgit v1.2.3 From 42d6a8ce3c3f70bf77a40ace385f59e1b5b9918f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sun, 19 Oct 2014 07:52:44 -0400 Subject: dm thin: refactor requeue_io to eliminate spinlock bouncing Also refactor some other bio_list erroring helpers. 
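(Illustrative sketch, not part of the patch; the helper name is made up.) The bouncing being eliminated is one lock/unlock round trip per bio list. After the refactor both lists are spliced out under a single acquisition of tc->lock and the bios are completed with the lock dropped:

#include <linux/bio.h>
#include <linux/spinlock.h>

static void drain_two_lists(spinlock_t *lock, struct bio_list *a,
			    struct bio_list *b, int error)
{
	struct bio_list bios;
	struct bio *bio;
	unsigned long flags;

	bio_list_init(&bios);

	/* One acquisition instead of one per list. */
	spin_lock_irqsave(lock, flags);
	bio_list_merge(&bios, a);
	bio_list_init(a);
	bio_list_merge(&bios, b);
	bio_list_init(b);
	spin_unlock_irqrestore(lock, flags);

	/* Complete the bios outside the lock. */
	while ((bio = bio_list_pop(&bios)))
		bio_endio(bio, error);	/* two-argument form, as in 3.18 */
}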
Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 575e3ed723cc..fb05f6a4bbfd 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -457,21 +457,32 @@ struct dm_thin_endio_hook { struct rb_node rb_node; }; -static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) +static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) +{ + bio_list_merge(bios, master); + bio_list_init(master); +} + +static void error_bio_list(struct bio_list *bios, int error) { struct bio *bio; + + while ((bio = bio_list_pop(bios))) + bio_endio(bio, error); +} + +static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) +{ struct bio_list bios; unsigned long flags; bio_list_init(&bios); spin_lock_irqsave(&tc->lock, flags); - bio_list_merge(&bios, master); - bio_list_init(master); + __merge_bio_list(&bios, master); spin_unlock_irqrestore(&tc->lock, flags); - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, DM_ENDIO_REQUEUE); + error_bio_list(&bios, error); } static void requeue_deferred_cells(struct thin_c *tc) @@ -493,26 +504,18 @@ static void requeue_deferred_cells(struct thin_c *tc) static void requeue_io(struct thin_c *tc) { - requeue_bio_list(tc, &tc->deferred_bio_list); - requeue_bio_list(tc, &tc->retry_on_resume_list); - requeue_deferred_cells(tc); -} - -static void error_thin_retry_list(struct thin_c *tc) -{ - struct bio *bio; - unsigned long flags; struct bio_list bios; + unsigned long flags; bio_list_init(&bios); spin_lock_irqsave(&tc->lock, flags); - bio_list_merge(&bios, &tc->retry_on_resume_list); - bio_list_init(&tc->retry_on_resume_list); + __merge_bio_list(&bios, &tc->deferred_bio_list); + __merge_bio_list(&bios, &tc->retry_on_resume_list); spin_unlock_irqrestore(&tc->lock, flags); - while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); + error_bio_list(&bios, DM_ENDIO_REQUEUE); + requeue_deferred_cells(tc); } static void error_retry_list(struct pool *pool) @@ -521,7 +524,7 @@ static void error_retry_list(struct pool *pool) rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) - error_thin_retry_list(tc); + error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO); rcu_read_unlock(); } @@ -1752,7 +1755,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) unsigned count = 0; if (tc->requeue_mode) { - requeue_bio_list(tc, &tc->deferred_bio_list); + error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE); return; } -- cgit v1.2.3 From 33423974bfc1c61193df765078f0466fece7021e Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 28 Oct 2014 15:09:56 -0700 Subject: dm: Use rcu_dereference() for accessing rcu pointer The map field in 'struct mapped_device' is an rcu pointer. Use rcu_dereference() while accessing it. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. 
McKenney Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0fee0e54d36f..c5e14eee8c76 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2332,7 +2332,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, merge_is_optional = dm_table_merge_is_optional(t); - old_map = md->map; + old_map = rcu_dereference(md->map); rcu_assign_pointer(md->map, t); md->immutable_target_type = dm_table_get_immutable_target_type(t); @@ -2351,7 +2351,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, */ static struct dm_table *__unbind(struct mapped_device *md) { - struct dm_table *map = md->map; + struct dm_table *map = rcu_dereference(md->map); if (!map) return NULL; @@ -2745,7 +2745,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) goto out_unlock; } - map = md->map; + map = rcu_dereference(md->map); /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2839,7 +2839,7 @@ int dm_resume(struct mapped_device *md) if (!dm_suspended_md(md)) goto out; - map = md->map; + map = rcu_dereference(md->map); if (!map || !dm_table_get_size(map)) goto out; -- cgit v1.2.3 From 6fa9952097747f71c5077f3e14ce3f8adee6f778 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 28 Oct 2014 15:09:57 -0700 Subject: dm: sparse: Annotate field with __rcu for checking Annotate the map field with __rcu since this is a rcu pointer which is checked by sparse. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c5e14eee8c76..16a806a99b99 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -140,7 +140,7 @@ struct mapped_device { * Use dm_get_live_table{_fast} or take suspend_lock for * dereference. */ - struct dm_table *map; + struct dm_table __rcu *map; struct list_head table_devices; struct mutex table_devices_lock; -- cgit v1.2.3 From 41abc4e1af369bb5438eaee398e3beee690cc8ca Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 5 Nov 2014 14:35:50 +0100 Subject: dm: do not call dm_sync_table() when creating new devices When creating new devices dm_sync_table() calls synchronize_rcu_expedited(), causing _all_ pending RCU pointers to be flushed. This causes a latency overhead that is especially noticeable when creating lots of devices. And all of this is pointless as there are no old maps to be disconnected, and hence no stale pointers which would need to be cleared up. Signed-off-by: Hannes Reinecke Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 16a806a99b99..866ff19aa438 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2341,7 +2341,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); else clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - dm_sync_table(md); + if (old_map) + dm_sync_table(md); return old_map; } @@ -2782,7 +2783,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * flush_workqueue(md->wq). 
*/ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - synchronize_srcu(&md->io_barrier); + if (map) + synchronize_srcu(&md->io_barrier); /* * Stop md->queue before flushing md->wq in case request-based @@ -2802,7 +2804,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - synchronize_srcu(&md->io_barrier); + if (map) + synchronize_srcu(&md->io_barrier); /* were we interrupted ? */ if (r < 0) { -- cgit v1.2.3 From b155aa0e5a81ea1f05ff7aced0ec8e34c980c19e Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 22 Oct 2014 14:30:58 +0100 Subject: dm cache policy mq: tweak algorithm that decides when to promote a block Rather than maintaining a separate promote_threshold variable that we periodically update we now use the hit count of the oldest clean block. Also add a fudge factor to discourage demoting dirty blocks. With some tests this has a sizeable difference, because the old code was too eager to demote blocks. For example, device-mapper-test-suite's git_extract_cache_quick test goes from taking 190 seconds, to 142 (linear on spindle takes 250). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-mq.c | 75 +++++++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 0e385e40909e..334d098d720d 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -181,24 +181,30 @@ static void queue_shift_down(struct queue *q) * Gives us the oldest entry of the lowest popoulated level. If the first * level is emptied then we shift down one level. */ -static struct list_head *queue_pop(struct queue *q) +static struct list_head *queue_peek(struct queue *q) { unsigned level; - struct list_head *r; for (level = 0; level < NR_QUEUE_LEVELS; level++) - if (!list_empty(q->qs + level)) { - r = q->qs[level].next; - list_del(r); + if (!list_empty(q->qs + level)) + return q->qs[level].next; - /* have we just emptied the bottom level? */ - if (level == 0 && list_empty(q->qs)) - queue_shift_down(q); + return NULL; +} - return r; - } +static struct list_head *queue_pop(struct queue *q) +{ + struct list_head *r = queue_peek(q); - return NULL; + if (r) { + list_del(r); + + /* have we just emptied the bottom level? */ + if (list_empty(q->qs)) + queue_shift_down(q); + } + + return r; } static struct list_head *list_pop(struct list_head *lh) @@ -383,13 +389,6 @@ struct mq_policy { unsigned generation; unsigned generation_period; /* in lookups (will probably change) */ - /* - * Entries in the pre_cache whose hit count passes the promotion - * threshold move to the cache proper. Working out the correct - * value for the promotion_threshold is crucial to this policy. - */ - unsigned promote_threshold; - unsigned discard_promote_adjustment; unsigned read_promote_adjustment; unsigned write_promote_adjustment; @@ -406,6 +405,7 @@ struct mq_policy { #define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 #define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 #define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 +#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128 /*----------------------------------------------------------------*/ @@ -518,6 +518,12 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q) return e; } +static struct entry *peek(struct queue *q) +{ + struct list_head *h = queue_peek(q); + return h ? 
container_of(h, struct entry, list) : NULL; +} + /* * Has this entry already been updated? */ @@ -570,10 +576,6 @@ static void check_generation(struct mq_policy *mq) break; } } - - mq->promote_threshold = nr ? total / nr : 1; - if (mq->promote_threshold * nr < total) - mq->promote_threshold++; } } @@ -640,6 +642,30 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) return 0; } +/* + * Entries in the pre_cache whose hit count passes the promotion + * threshold move to the cache proper. Working out the correct + * value for the promotion_threshold is crucial to this policy. + */ +static unsigned promote_threshold(struct mq_policy *mq) +{ + struct entry *e; + + if (any_free_cblocks(mq)) + return 0; + + e = peek(&mq->cache_clean); + if (e) + return e->hit_count; + + e = peek(&mq->cache_dirty); + if (e) + return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD; + + /* This should never happen */ + return 0; +} + /* * We modify the basic promotion_threshold depending on the specific io. * @@ -653,7 +679,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq, bool discarded_oblock, int data_dir) { if (data_dir == READ) - return mq->promote_threshold + mq->read_promote_adjustment; + return promote_threshold(mq) + mq->read_promote_adjustment; if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { /* @@ -663,7 +689,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq, return mq->discard_promote_adjustment; } - return mq->promote_threshold + mq->write_promote_adjustment; + return promote_threshold(mq) + mq->write_promote_adjustment; } static bool should_promote(struct mq_policy *mq, struct entry *e, @@ -1230,7 +1256,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, mq->tick = 0; mq->hit_count = 0; mq->generation = 0; - mq->promote_threshold = 0; mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; -- cgit v1.2.3 From f1afb36a6102b52949c2c6d8eb250eddcce3fc5f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 30 Oct 2014 10:02:01 -0400 Subject: dm cache policy mq: simplify ability to promote sequential IO to the cache Before, if the user wanted sequential IO to be promoted to the cache they'd have to set sequential_threshold to some nebulous large value. Now, the user may easily disable sequential IO detection (and sequential IO's implicit bypass of the cache) by setting sequential_threshold to 0. 
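For example (an illustrative table line, not taken from this patch: the device names, the 409600-sector origin size and the 512-sector cache block size are placeholders, and the layout follows the cache target's usual <start> <len> cache <metadata dev> <cache dev> <origin dev> <block size> <#feature args> <features> <policy> <#policy args> <policy args> form from Documentation/device-mapper/cache.txt), sequential IO detection is disabled by loading the target with the tunable set to 0:

  dmsetup create cached --table '0 409600 cache /dev/mapper/meta /dev/mapper/fast /dev/mapper/slow 512 1 writeback mq 2 sequential_threshold 0'

With the threshold at 0 the tracker's sequential check is skipped entirely, so sequential IO competes for promotion like any other IO.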
Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-mq.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 334d098d720d..13f547a4eeb6 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -865,7 +865,8 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock, if (e && in_cache(mq, e)) r = cache_entry_found(mq, e, result); - else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) + else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] && + iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) result->op = POLICY_MISS; else if (e) @@ -1290,7 +1291,7 @@ bad_pre_cache_init: static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1298,7 +1299,7 @@ static struct dm_cache_policy_type mq_policy_type = { static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create, -- cgit v1.2.3 From 5f274d886598c9fd26d2499bf3f68306f170e9db Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 17 Sep 2014 10:17:39 +0100 Subject: dm bio prison: introduce support for locking ranges of blocks Ranges will be placed in the same cell if they overlap. Range locking is a prerequisite for more efficient multi-block discard support in both the cache and thin-provisioning targets. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison.c | 4 ++-- drivers/md/dm-bio-prison.h | 12 ++++++++---- drivers/md/dm-cache-target.c | 3 ++- drivers/md/dm-thin.c | 6 ++++-- 4 files changed, 16 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index bbe22a5dc06b..be065300e93c 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -95,10 +95,10 @@ static int cmp_keys(struct dm_cell_key *lhs, if (lhs->dev > rhs->dev) return 1; - if (lhs->block < rhs->block) + if (lhs->block_end <= rhs->block_begin) return -1; - if (lhs->block > rhs->block) + if (lhs->block_begin >= rhs->block_end) return 1; return 0; diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index b03988667740..74cf01144b1f 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -23,11 +23,14 @@ */ struct dm_bio_prison; -/* FIXME: this needs to be more abstract */ +/* + * Keys define a range of blocks within either a virtual or physical + * device. + */ struct dm_cell_key { int virtual; dm_thin_id dev; - dm_block_t block; + dm_block_t block_begin, block_end; }; /* @@ -59,7 +62,7 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell); /* - * Creates, or retrieves a cell for the given key. + * Creates, or retrieves a cell that overlaps the given key. * * Returns 1 if pre-existing cell returned, zero if new cell created using * @cell_prealloc. @@ -70,7 +73,8 @@ int dm_get_cell(struct dm_bio_prison *prison, struct dm_bio_prison_cell **cell_result); /* - * An atomic op that combines retrieving a cell, and adding a bio to it. + * An atomic op that combines retrieving or creating a cell, and adding a + * bio to it. * * Returns 1 if the cell was already held, 0 if @inmate is the new holder. 
*/ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 69de8b43ca12..890e2fff4074 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -436,7 +436,8 @@ static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) { key->virtual = 0; key->dev = 0; - key->block = from_oblock(oblock); + key->block_begin = from_oblock(oblock); + key->block_end = key->block_begin + 1ULL; } /* diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fb05f6a4bbfd..8c5504c0e894 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -115,7 +115,8 @@ static void build_data_key(struct dm_thin_device *td, { key->virtual = 0; key->dev = dm_thin_dev_id(td); - key->block = b; + key->block_begin = b; + key->block_end = b + 1ULL; } static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, @@ -123,7 +124,8 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, { key->virtual = 1; key->dev = dm_thin_dev_id(td); - key->block = b; + key->block_begin = b; + key->block_end = b + 1ULL; } /*----------------------------------------------------------------*/ -- cgit v1.2.3 From 1bad9bc4ee899a108499e5eac6baafff018b4d0b Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 7 Nov 2014 14:47:07 +0000 Subject: dm cache: revert "remove remainder of distinct discard block size" This reverts commit 64ab346a360a4b15c28fb8531918d4a01f4eabd9 because we actually do want to allow the discard blocksize to be larger than the cache blocksize. Further dm-cache discard changes will make this possible. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-block-types.h | 11 ++++++ drivers/md/dm-cache-metadata.c | 34 +++++++++--------- drivers/md/dm-cache-metadata.h | 6 ++-- drivers/md/dm-cache-target.c | 72 +++++++++++++++++++++++++-------------- 4 files changed, 77 insertions(+), 46 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h index aac0e2df06be..bed4ad4e1b7c 100644 --- a/drivers/md/dm-cache-block-types.h +++ b/drivers/md/dm-cache-block-types.h @@ -19,6 +19,7 @@ typedef dm_block_t __bitwise__ dm_oblock_t; typedef uint32_t __bitwise__ dm_cblock_t; +typedef dm_block_t __bitwise__ dm_dblock_t; static inline dm_oblock_t to_oblock(dm_block_t b) { @@ -40,4 +41,14 @@ static inline uint32_t from_cblock(dm_cblock_t b) return (__force uint32_t) b; } +static inline dm_dblock_t to_dblock(dm_block_t b) +{ + return (__force dm_dblock_t) b; +} + +static inline dm_block_t from_dblock(dm_dblock_t b) +{ + return (__force dm_block_t) b; +} + #endif /* DM_CACHE_BLOCK_TYPES_H */ diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 06709257adde..9fc616c2755e 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -109,7 +109,7 @@ struct dm_cache_metadata { dm_block_t discard_root; sector_t discard_block_size; - dm_oblock_t discard_nr_blocks; + dm_dblock_t discard_nr_blocks; sector_t data_block_size; dm_cblock_t cache_blocks; @@ -329,7 +329,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->hint_root = cpu_to_le64(cmd->hint_root); disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); - disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); + disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); disk_super->metadata_block_size = 
cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE); disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); disk_super->cache_blocks = cpu_to_le32(0); @@ -528,7 +528,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->hint_root = le64_to_cpu(disk_super->hint_root); cmd->discard_root = le64_to_cpu(disk_super->discard_root); cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size); - cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks)); + cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks)); cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); @@ -626,7 +626,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->hint_root = cpu_to_le64(cmd->hint_root); disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); - disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); + disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); @@ -797,15 +797,15 @@ out: int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, sector_t discard_block_size, - dm_oblock_t new_nr_entries) + dm_dblock_t new_nr_entries) { int r; down_write(&cmd->root_lock); r = dm_bitset_resize(&cmd->discard_info, cmd->discard_root, - from_oblock(cmd->discard_nr_blocks), - from_oblock(new_nr_entries), + from_dblock(cmd->discard_nr_blocks), + from_dblock(new_nr_entries), false, &cmd->discard_root); if (!r) { cmd->discard_block_size = discard_block_size; @@ -818,28 +818,28 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, return r; } -static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) +static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) { return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root, - from_oblock(b), &cmd->discard_root); + from_dblock(b), &cmd->discard_root); } -static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) +static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) { return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root, - from_oblock(b), &cmd->discard_root); + from_dblock(b), &cmd->discard_root); } -static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b, +static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b, bool *is_discarded) { return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, - from_oblock(b), &cmd->discard_root, + from_dblock(b), &cmd->discard_root, is_discarded); } static int __discard(struct dm_cache_metadata *cmd, - dm_oblock_t dblock, bool discard) + dm_dblock_t dblock, bool discard) { int r; @@ -852,7 +852,7 @@ static int __discard(struct dm_cache_metadata *cmd, } int dm_cache_set_discard(struct dm_cache_metadata *cmd, - dm_oblock_t dblock, bool discard) + dm_dblock_t dblock, bool discard) { int r; @@ -870,8 +870,8 @@ static int __load_discards(struct dm_cache_metadata *cmd, dm_block_t b; bool discard; - for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) { - dm_oblock_t dblock = to_oblock(b); + for (b = 0; b < 
from_dblock(cmd->discard_nr_blocks); b++) { + dm_dblock_t dblock = to_dblock(b); if (cmd->clean_when_opened) { r = __is_discarded(cmd, dblock, &discard); diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 7383c90ccdb8..4ecc403be283 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -70,14 +70,14 @@ dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd); int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, sector_t discard_block_size, - dm_oblock_t new_nr_entries); + dm_dblock_t new_nr_entries); typedef int (*load_discard_fn)(void *context, sector_t discard_block_size, - dm_oblock_t dblock, bool discarded); + dm_dblock_t dblock, bool discarded); int dm_cache_load_discards(struct dm_cache_metadata *cmd, load_discard_fn fn, void *context); -int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard); +int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard); int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock); int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 890e2fff4074..ced7fd4adddb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -236,8 +236,9 @@ struct cache { /* * origin_blocks entries, discarded if set. */ - dm_oblock_t discard_nr_blocks; + dm_dblock_t discard_nr_blocks; unsigned long *discard_bitset; + uint32_t discard_block_size; /* * Rather than reconstructing the table line for the status we just @@ -524,33 +525,48 @@ static dm_block_t block_div(dm_block_t b, uint32_t n) return b; } -static void set_discard(struct cache *cache, dm_oblock_t b) +static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) +{ + uint32_t discard_blocks = cache->discard_block_size; + dm_block_t b = from_oblock(oblock); + + if (!block_size_is_power_of_two(cache)) + discard_blocks = discard_blocks / cache->sectors_per_block; + else + discard_blocks >>= cache->sectors_per_block_shift; + + b = block_div(b, discard_blocks); + + return to_dblock(b); +} + +static void set_discard(struct cache *cache, dm_dblock_t b) { unsigned long flags; atomic_inc(&cache->stats.discard_count); spin_lock_irqsave(&cache->lock, flags); - set_bit(from_oblock(b), cache->discard_bitset); + set_bit(from_dblock(b), cache->discard_bitset); spin_unlock_irqrestore(&cache->lock, flags); } -static void clear_discard(struct cache *cache, dm_oblock_t b) +static void clear_discard(struct cache *cache, dm_dblock_t b) { unsigned long flags; spin_lock_irqsave(&cache->lock, flags); - clear_bit(from_oblock(b), cache->discard_bitset); + clear_bit(from_dblock(b), cache->discard_bitset); spin_unlock_irqrestore(&cache->lock, flags); } -static bool is_discarded(struct cache *cache, dm_oblock_t b) +static bool is_discarded(struct cache *cache, dm_dblock_t b) { int r; unsigned long flags; spin_lock_irqsave(&cache->lock, flags); - r = test_bit(from_oblock(b), cache->discard_bitset); + r = test_bit(from_dblock(b), cache->discard_bitset); spin_unlock_irqrestore(&cache->lock, flags); return r; @@ -562,7 +578,8 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) unsigned long flags; spin_lock_irqsave(&cache->lock, flags); - r = test_bit(from_oblock(b), cache->discard_bitset); + r = test_bit(from_dblock(oblock_to_dblock(cache, b)), + cache->discard_bitset); spin_unlock_irqrestore(&cache->lock, flags); 
return r; @@ -687,7 +704,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, check_if_tick_bio_needed(cache, bio); remap_to_origin(cache, bio); if (bio_data_dir(bio) == WRITE) - clear_discard(cache, oblock); + clear_discard(cache, oblock_to_dblock(cache, oblock)); } static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, @@ -697,7 +714,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, remap_to_cache(cache, bio, cblock); if (bio_data_dir(bio) == WRITE) { set_dirty(cache, oblock, cblock); - clear_discard(cache, oblock); + clear_discard(cache, oblock_to_dblock(cache, oblock)); } } @@ -1301,14 +1318,14 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) static void process_discard_bio(struct cache *cache, struct bio *bio) { dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, - cache->sectors_per_block); + cache->discard_block_size); dm_block_t end_block = bio_end_sector(bio); dm_block_t b; - end_block = block_div(end_block, cache->sectors_per_block); + end_block = block_div(end_block, cache->discard_block_size); for (b = start_block; b < end_block; b++) - set_discard(cache, to_oblock(b)); + set_discard(cache, to_dblock(b)); bio_endio(bio, 0); } @@ -2303,13 +2320,14 @@ static int cache_create(struct cache_args *ca, struct cache **result) } clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); - cache->discard_nr_blocks = cache->origin_blocks; - cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks)); + cache->discard_block_size = cache->sectors_per_block; + cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks); + cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); if (!cache->discard_bitset) { *error = "could not allocate discard bitset"; goto bad; } - clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks)); + clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(cache->copier)) { @@ -2599,16 +2617,16 @@ static int write_discard_bitset(struct cache *cache) { unsigned i, r; - r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block, - cache->origin_blocks); + r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, + cache->discard_nr_blocks); if (r) { DMERR("could not resize on-disk discard bitset"); return r; } - for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) { - r = dm_cache_set_discard(cache->cmd, to_oblock(i), - is_discarded(cache, to_oblock(i))); + for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { + r = dm_cache_set_discard(cache->cmd, to_dblock(i), + is_discarded(cache, to_dblock(i))); if (r) return r; } @@ -2681,14 +2699,16 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, } static int load_discard(void *context, sector_t discard_block_size, - dm_oblock_t oblock, bool discard) + dm_dblock_t dblock, bool discard) { struct cache *cache = context; + /* FIXME: handle mis-matched block size */ + if (discard) - set_discard(cache, oblock); + set_discard(cache, dblock); else - clear_discard(cache, oblock); + clear_discard(cache, dblock); return 0; } @@ -3079,8 +3099,8 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits) /* * FIXME: these limits may be incompatible with the cache device */ - limits->max_discard_sectors = cache->sectors_per_block; - limits->discard_granularity = 
cache->sectors_per_block << SECTOR_SHIFT; + limits->max_discard_sectors = cache->discard_block_size; + limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; } static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) -- cgit v1.2.3 From 08b184514f65d160ce66381dafca5962e3d8f785 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 6 Nov 2014 14:38:01 +0000 Subject: dm cache: revert "prevent corruption caused by discard_block_size > cache_block_size" This reverts commit d132cc6d9e92424bb9d4fd35f5bd0e55d583f4be because we actually do want to allow the discard blocksize to be larger than the cache blocksize. Further dm-cache discard changes will make this possible. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index ced7fd4adddb..c2ca74374944 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -238,7 +238,7 @@ struct cache { */ dm_dblock_t discard_nr_blocks; unsigned long *discard_bitset; - uint32_t discard_block_size; + uint32_t discard_block_size; /* a power of 2 times sectors per block */ /* * Rather than reconstructing the table line for the status we just @@ -2197,6 +2197,35 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, return 0; } +/* + * We want the discard block size to be a power of two, at least the size + * of the cache block size, and have no more than 2^14 discard blocks + * across the origin. + */ +#define MAX_DISCARD_BLOCKS (1 << 14) + +static bool too_many_discard_blocks(sector_t discard_block_size, + sector_t origin_size) +{ + (void) sector_div(origin_size, discard_block_size); + + return origin_size > MAX_DISCARD_BLOCKS; +} + +static sector_t calculate_discard_block_size(sector_t cache_block_size, + sector_t origin_size) +{ + sector_t discard_block_size; + + discard_block_size = roundup_pow_of_two(cache_block_size); + + if (origin_size) + while (too_many_discard_blocks(discard_block_size, origin_size)) + discard_block_size *= 2; + + return discard_block_size; +} + #define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) @@ -2320,7 +2349,9 @@ static int cache_create(struct cache_args *ca, struct cache **result) } clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); - cache->discard_block_size = cache->sectors_per_block; + cache->discard_block_size = + calculate_discard_block_size(cache->sectors_per_block, + cache->origin_sectors); cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks); cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); if (!cache->discard_bitset) { @@ -3099,7 +3130,7 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits) /* * FIXME: these limits may be incompatible with the cache device */ - limits->max_discard_sectors = cache->discard_block_size; + limits->max_discard_sectors = cache->discard_block_size * 1024; limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; } -- cgit v1.2.3 From 7ae34e7778966d39f66397491eb114b613202c20 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 6 Nov 2014 10:18:04 +0000 Subject: dm cache: improve discard support Safely allow the discard blocksize to be larger than the cache blocksize by using the bio prison's range locking support. 
This also improves discard performance considerably because larger discards are issued to the dm-cache device. The discard blocksize was always intended to be greater than the cache blocksize. But until now it wasn't implemented safely. Also, by safely restoring the ability to have discard blocksize larger than cache blocksize we're able to significantly reduce the memory used for the cache's discard bitset. Before, with a small discard blocksize, the discard bitset could get quite large because its size is a function of the discard blocksize and the origin device's size. For example, previously, using a 32KB cache blocksize with a 40TB origin resulted in 1280MB of in-core memory use for the discard bitset! Now, the discard blocksize is scaled up accordingly to ensure the discard bitset is capped at 2**14 bits, or 16KB. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 166 +++++++++++++++++++++++++++++++------------ 1 file changed, 121 insertions(+), 45 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index c2ca74374944..6e36a0753105 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -310,6 +310,7 @@ struct dm_cache_migration { dm_cblock_t cblock; bool err:1; + bool discard:1; bool writeback:1; bool demote:1; bool promote:1; @@ -433,12 +434,12 @@ static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cel /*----------------------------------------------------------------*/ -static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) +static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) { key->virtual = 0; key->dev = 0; - key->block_begin = from_oblock(oblock); - key->block_end = key->block_begin + 1ULL; + key->block_begin = from_oblock(begin); + key->block_end = from_oblock(end); } /* @@ -448,15 +449,15 @@ static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) */ typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); -static int bio_detain(struct cache *cache, dm_oblock_t oblock, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) +static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, + struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, + cell_free_fn free_fn, void *free_context, + struct dm_bio_prison_cell **cell_result) { int r; struct dm_cell_key key; - build_key(oblock, &key); + build_key(oblock_begin, oblock_end, &key); r = dm_bio_detain(cache->prison, &key, bio, 
cell_prealloc, cell_result); if (r) prealloc_put_cell(structs, cell_prealloc); @@ -525,25 +536,34 @@ static dm_block_t block_div(dm_block_t b, uint32_t n) return b; } -static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) +static dm_block_t oblocks_per_dblock(struct cache *cache) { - uint32_t discard_blocks = cache->discard_block_size; - dm_block_t b = from_oblock(oblock); + dm_block_t oblocks = cache->discard_block_size; - if (!block_size_is_power_of_two(cache)) - discard_blocks = discard_blocks / cache->sectors_per_block; + if (block_size_is_power_of_two(cache)) + oblocks >>= cache->sectors_per_block_shift; else - discard_blocks >>= cache->sectors_per_block_shift; + oblocks = block_div(oblocks, cache->sectors_per_block); - b = block_div(b, discard_blocks); + return oblocks; +} + +static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) +{ + return to_dblock(block_div(from_oblock(oblock), + oblocks_per_dblock(cache))); +} - return to_dblock(b); +static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) +{ + return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); } static void set_discard(struct cache *cache, dm_dblock_t b) { unsigned long flags; + BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); atomic_inc(&cache->stats.discard_count); spin_lock_irqsave(&cache->lock, flags); @@ -995,7 +1015,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) wake_worker(cache); } -static void issue_copy_real(struct dm_cache_migration *mg) +static void issue_copy(struct dm_cache_migration *mg) { int r; struct dm_io_region o_region, c_region; @@ -1074,11 +1094,46 @@ static void avoid_copy(struct dm_cache_migration *mg) migration_success_pre_commit(mg); } -static void issue_copy(struct dm_cache_migration *mg) +static void calc_discard_block_range(struct cache *cache, struct bio *bio, + dm_dblock_t *b, dm_dblock_t *e) +{ + sector_t sb = bio->bi_iter.bi_sector; + sector_t se = bio_end_sector(bio); + + *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); + + if (se - sb < cache->discard_block_size) + *e = *b; + else + *e = to_dblock(block_div(se, cache->discard_block_size)); +} + +static void issue_discard(struct dm_cache_migration *mg) +{ + dm_dblock_t b, e; + struct bio *bio = mg->new_ocell->holder; + + calc_discard_block_range(mg->cache, bio, &b, &e); + while (b != e) { + set_discard(mg->cache, b); + b = to_dblock(from_dblock(b) + 1); + } + + bio_endio(bio, 0); + cell_defer(mg->cache, mg->new_ocell, false); + free_migration(mg); +} + +static void issue_copy_or_discard(struct dm_cache_migration *mg) { bool avoid; struct cache *cache = mg->cache; + if (mg->discard) { + issue_discard(mg); + return; + } + if (mg->writeback || mg->demote) avoid = !is_dirty(cache, mg->cblock) || is_discarded_oblock(cache, mg->old_oblock); @@ -1093,7 +1148,7 @@ static void issue_copy(struct dm_cache_migration *mg) } } - avoid ? avoid_copy(mg) : issue_copy_real(mg); + avoid ? 
avoid_copy(mg) : issue_copy(mg); } static void complete_migration(struct dm_cache_migration *mg) @@ -1178,6 +1233,7 @@ static void promote(struct cache *cache, struct prealloc *structs, struct dm_cache_migration *mg = prealloc_get_migration(structs); mg->err = false; + mg->discard = false; mg->writeback = false; mg->demote = false; mg->promote = true; @@ -1201,6 +1257,7 @@ static void writeback(struct cache *cache, struct prealloc *structs, struct dm_cache_migration *mg = prealloc_get_migration(structs); mg->err = false; + mg->discard = false; mg->writeback = true; mg->demote = false; mg->promote = false; @@ -1226,6 +1283,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, struct dm_cache_migration *mg = prealloc_get_migration(structs); mg->err = false; + mg->discard = false; mg->writeback = false; mg->demote = true; mg->promote = true; @@ -1254,6 +1312,7 @@ static void invalidate(struct cache *cache, struct prealloc *structs, struct dm_cache_migration *mg = prealloc_get_migration(structs); mg->err = false; + mg->discard = false; mg->writeback = false; mg->demote = true; mg->promote = false; @@ -1270,6 +1329,26 @@ static void invalidate(struct cache *cache, struct prealloc *structs, quiesce_migration(mg); } +static void discard(struct cache *cache, struct prealloc *structs, + struct dm_bio_prison_cell *cell) +{ + struct dm_cache_migration *mg = prealloc_get_migration(structs); + + mg->err = false; + mg->discard = true; + mg->writeback = false; + mg->demote = false; + mg->promote = false; + mg->requeue_holder = false; + mg->invalidate = false; + mg->cache = cache; + mg->old_ocell = NULL; + mg->new_ocell = cell; + mg->start_jiffies = jiffies; + + quiesce_migration(mg); +} + /*---------------------------------------------------------------- * bio processing *--------------------------------------------------------------*/ @@ -1303,31 +1382,27 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) issue(cache, bio); } -/* - * People generally discard large parts of a device, eg, the whole device - * when formatting. Splitting these large discards up into cache block - * sized ios and then quiescing (always neccessary for discard) takes too - * long. - * - * We keep it simple, and allow any size of discard to come in, and just - * mark off blocks on the discard bitset. No passdown occurs! - * - * To implement passdown we need to change the bio_prison such that a cell - * can have a key that spans many blocks. 
- */ -static void process_discard_bio(struct cache *cache, struct bio *bio) +static void process_discard_bio(struct cache *cache, struct prealloc *structs, + struct bio *bio) { - dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, - cache->discard_block_size); - dm_block_t end_block = bio_end_sector(bio); - dm_block_t b; + int r; + dm_dblock_t b, e; + struct dm_bio_prison_cell *cell_prealloc, *new_ocell; - end_block = block_div(end_block, cache->discard_block_size); + calc_discard_block_range(cache, bio, &b, &e); + if (b == e) { + bio_endio(bio, 0); + return; + } - for (b = start_block; b < end_block; b++) - set_discard(cache, to_dblock(b)); + cell_prealloc = prealloc_get_cell(structs); + r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, + (cell_free_fn) prealloc_put_cell, + structs, &new_ocell); + if (r > 0) + return; - bio_endio(bio, 0); + discard(cache, structs, new_ocell); } static bool spare_migration_bandwidth(struct cache *cache) @@ -1517,7 +1592,7 @@ static void process_deferred_bios(struct cache *cache) if (bio->bi_rw & REQ_FLUSH) process_flush_bio(cache, bio); else if (bio->bi_rw & REQ_DISCARD) - process_discard_bio(cache, bio); + process_discard_bio(cache, &structs, bio); else process_bio(cache, &structs, bio); } @@ -1732,7 +1807,7 @@ static void do_worker(struct work_struct *ws) process_invalidation_requests(cache); } - process_migrations(cache, &cache->quiesced_migrations, issue_copy); + process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); process_migrations(cache, &cache->completed_migrations, complete_migration); if (commit_if_needed(cache)) { @@ -3130,7 +3205,8 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits) /* * FIXME: these limits may be incompatible with the cache device */ - limits->max_discard_sectors = cache->discard_block_size * 1024; + limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, + cache->origin_sectors); limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; } @@ -3155,7 +3231,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, -- cgit v1.2.3 From d1d9220cbaeecce910f3ecfeb71cc897a678eb68 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 11 Nov 2014 11:58:32 +0000 Subject: dm cache: emit a warning message if there are a lot of cache blocks Loading and saving millions of block mappings takes time. We may as well explain what's going on, and encourage people to use a larger cache block size. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 6e36a0753105..abdd45d07bf6 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2301,6 +2301,19 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size, return discard_block_size; } +static void set_cache_size(struct cache *cache, dm_cblock_t size) +{ + dm_block_t nr_blocks = from_cblock(size); + + if (nr_blocks > (1 << 20) && cache->cache_size != size) + DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" + "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" + "Please consider increasing the cache block size to reduce the overall cache block count.", + (unsigned long long) nr_blocks); + + cache->cache_size = size; +} + #define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) @@ -2356,10 +2369,10 @@ static int cache_create(struct cache_args *ca, struct cache **result) cache->sectors_per_block_shift = -1; cache_size = block_div(cache_size, ca->block_size); - cache->cache_size = to_cblock(cache_size); + set_cache_size(cache, to_cblock(cache_size)); } else { cache->sectors_per_block_shift = __ffs(ca->block_size); - cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); + set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); } r = create_cache_policy(cache, ca, error); @@ -2856,7 +2869,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) return r; } - cache->cache_size = new_size; + set_cache_size(cache, new_size); return 0; } -- cgit v1.2.3 From 17181fb7a0c3a279196c0eeb2caba65a1519614b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 5 Nov 2014 17:00:13 -0500 Subject: dm thin: fix a race in thin_dtr As long as struct thin_c is in the list, anyone can grab a reference to it. Consequently, we must wait for the reference count to drop to zero *after* we remove the structure from the list, not before. 
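Reduced to a generic sketch (hypothetical obj/parent names; this shows the pattern, not the dm-thin code), the corrected teardown order is: unpublish, wait out RCU readers, then drop and wait on the refcount:

	/* Schematic teardown for a refcounted, RCU-listed object. */
	static void obj_destroy(struct obj *o)
	{
		unsigned long flags;

		spin_lock_irqsave(&o->parent_lock, flags);
		list_del_rcu(&o->list);		/* 1. no new lookups can find it */
		spin_unlock_irqrestore(&o->parent_lock, flags);

		synchronize_rcu();		/* 2. readers already walking the list finish */

		/*
		 * 3. Only now drop our reference.  Doing this first (as the
		 * old code did) let a concurrent lookup take a fresh
		 * reference after the "last" put -- use-after-free territory.
		 */
		obj_put(o);
		wait_for_completion(&o->can_destroy);
	}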
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 8c5504c0e894..767417a28b6f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3622,14 +3622,14 @@ static void thin_dtr(struct dm_target *ti) struct thin_c *tc = ti->private; unsigned long flags; - thin_put(tc); - wait_for_completion(&tc->can_destroy); - spin_lock_irqsave(&tc->pool->lock, flags); list_del_rcu(&tc->list); spin_unlock_irqrestore(&tc->pool->lock, flags); synchronize_rcu(); + thin_put(tc); + wait_for_completion(&tc->can_destroy); + mutex_lock(&dm_thin_pool_table.mutex); __pool_dec(tc->pool); -- cgit v1.2.3 From 5ec02084f60f1537df850817fb91a16072cba4e7 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 7 Nov 2014 15:27:56 -0500 Subject: dm thin: remove stale 'trim' message in block comment above pool_message Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 767417a28b6f..64fd4de2986f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3296,7 +3296,6 @@ static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct * create_thin * create_snap * delete - * trim * set_transaction_id * reserve_metadata_snap * release_metadata_snap -- cgit v1.2.3 From 4d341d8216336174d35cd2575b6b9e4267a88ac8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sun, 16 Nov 2014 14:21:47 -0500 Subject: dm: return earlier from dm_blk_ioctl if target doesn't implement .ioctl No point checking if the device is suspended if the current target doesn't even implement .ioctl. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 866ff19aa438..f8cdd97c28a7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -525,14 +525,15 @@ retry: goto out; tgt = dm_table_get_target(map, 0); + if (!tgt->type->ioctl) + goto out; if (dm_suspended_md(md)) { r = -EAGAIN; goto out; } - if (tgt->type->ioctl) - r = tgt->type->ioctl(tgt, cmd, arg); + r = tgt->type->ioctl(tgt, cmd, arg); out: dm_put_live_table(md, srcu_idx); -- cgit v1.2.3 From d67ee213fa5700c7da526fe5bcccd485cfa63d8b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 28 Oct 2014 20:13:31 -0400 Subject: dm: add presuspend_undo hook to target_type The DM thin-pool target now must undo the changes performed during pool_presuspend(), so introduce a presuspend_undo hook in target_type. 
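Schematically, a target opts in by pairing the two hooks so that presuspend_undo reverts exactly what presuspend did (hypothetical example target; only the suspend-related methods shown):

	/* Hypothetical target wiring up the new hook; other ops omitted. */
	static void example_presuspend(struct dm_target *ti)
	{
		/* quiesce: e.g. set a 'suspended' flag so no new work starts */
	}

	static void example_presuspend_undo(struct dm_target *ti)
	{
		/* revert presuspend exactly: e.g. clear the 'suspended' flag */
	}

	static struct target_type example_target = {
		.name            = "example",
		.version         = {1, 0, 0},
		.module          = THIS_MODULE,
		.presuspend      = example_presuspend,
		.presuspend_undo = example_presuspend_undo,	/* new hook */
		/* .ctr, .dtr, .map, .postsuspend, ... */
	};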
Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-table.c | 36 +++++++++++++++++++++++++++++------- drivers/md/dm.c | 10 ++++++++-- drivers/md/dm.h | 1 + 3 files changed, 38 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index b2bd1ebf4562..3afae9e062f8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1521,18 +1521,32 @@ fmode_t dm_table_get_mode(struct dm_table *t) } EXPORT_SYMBOL(dm_table_get_mode); -static void suspend_targets(struct dm_table *t, unsigned postsuspend) +enum suspend_mode { + PRESUSPEND, + PRESUSPEND_UNDO, + POSTSUSPEND, +}; + +static void suspend_targets(struct dm_table *t, enum suspend_mode mode) { int i = t->num_targets; struct dm_target *ti = t->targets; while (i--) { - if (postsuspend) { + switch (mode) { + case PRESUSPEND: + if (ti->type->presuspend) + ti->type->presuspend(ti); + break; + case PRESUSPEND_UNDO: + if (ti->type->presuspend_undo) + ti->type->presuspend_undo(ti); + break; + case POSTSUSPEND: if (ti->type->postsuspend) ti->type->postsuspend(ti); - } else if (ti->type->presuspend) - ti->type->presuspend(ti); - + break; + } ti++; } } @@ -1542,7 +1556,15 @@ void dm_table_presuspend_targets(struct dm_table *t) if (!t) return; - suspend_targets(t, 0); + suspend_targets(t, PRESUSPEND); +} + +void dm_table_presuspend_undo_targets(struct dm_table *t) +{ + if (!t) + return; + + suspend_targets(t, PRESUSPEND_UNDO); } void dm_table_postsuspend_targets(struct dm_table *t) @@ -1550,7 +1572,7 @@ void dm_table_postsuspend_targets(struct dm_table *t) if (!t) return; - suspend_targets(t, 1); + suspend_targets(t, POSTSUSPEND); } int dm_table_resume_targets(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f8cdd97c28a7..f84de3215982 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2756,7 +2756,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - /* This does not get reverted if there's an error later. */ + /* + * This gets reverted if there's an error later and the targets + * provide the .presuspend_undo hook. + */ dm_table_presuspend_targets(map); /* @@ -2767,8 +2770,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) */ if (!noflush && do_lockfs) { r = lock_fs(md); - if (r) + if (r) { + dm_table_presuspend_undo_targets(map); goto out_unlock; + } } /* @@ -2816,6 +2821,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) start_queue(md->queue); unlock_fs(md); + dm_table_presuspend_undo_targets(map); goto out_unlock; /* pushback list is already flushed, so skip flush */ } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 988c7fb7b145..781994093bf5 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -65,6 +65,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); +void dm_table_presuspend_undo_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); -- cgit v1.2.3 From 80e96c5484be788f277eead9cabf88cf8e430419 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 7 Nov 2014 15:09:46 -0500 Subject: dm thin: do not allow thin device activation while pool is suspended Otherwise IO could be issued to the pool while it is suspended. 
Care was taken to properly interlock between the thin and thin-pool targets when accessing the pool's 'suspended' flag. The thin_ctr will not add a new thin device to the pool's active_thins list if the pool is suspended. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin.c | 55 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 64fd4de2986f..f1b53e31d868 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -224,6 +224,7 @@ struct pool { struct pool_features pf; bool low_water_triggered:1; /* A dm event has been sent */ + bool suspended:1; struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; @@ -2575,6 +2576,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_LIST_HEAD(&pool->prepared_discards); INIT_LIST_HEAD(&pool->active_thins); pool->low_water_triggered = false; + pool->suspended = true; pool->shared_read_ds = dm_deferred_set_create(); if (!pool->shared_read_ds) { @@ -3119,12 +3121,36 @@ static void pool_resume(struct dm_target *ti) spin_lock_irqsave(&pool->lock, flags); pool->low_water_triggered = false; + pool->suspended = false; spin_unlock_irqrestore(&pool->lock, flags); + requeue_bios(pool); do_waker(&pool->waker.work); } +static void pool_presuspend(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + pool->suspended = true; + spin_unlock_irqrestore(&pool->lock, flags); +} + +static void pool_presuspend_undo(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + pool->suspended = false; + spin_unlock_irqrestore(&pool->lock, flags); +} + static void pool_postsuspend(struct dm_target *ti) { struct pool_c *pt = ti->private; @@ -3592,6 +3618,8 @@ static struct target_type pool_target = { .ctr = pool_ctr, .dtr = pool_dtr, .map = pool_map, + .presuspend = pool_presuspend, + .presuspend_undo = pool_presuspend_undo, .postsuspend = pool_postsuspend, .preresume = pool_preresume, .resume = pool_resume, @@ -3721,18 +3749,18 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (get_pool_mode(tc->pool) == PM_FAIL) { ti->error = "Couldn't open thin device, Pool is in fail mode"; r = -EINVAL; - goto bad_thin_open; + goto bad_pool; } r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); if (r) { ti->error = "Couldn't open thin internal device"; - goto bad_thin_open; + goto bad_pool; } r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); if (r) - goto bad_target_max_io_len; + goto bad; ti->num_flush_bios = 1; ti->flush_supported = true; @@ -3747,14 +3775,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->split_discard_bios = true; } - dm_put(pool_md); - mutex_unlock(&dm_thin_pool_table.mutex); - atomic_set(&tc->refcount, 1); - init_completion(&tc->can_destroy); - spin_lock_irqsave(&tc->pool->lock, flags); + if (tc->pool->suspended) { + spin_unlock_irqrestore(&tc->pool->lock, flags); + mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */ + ti->error = "Unable to activate thin device while pool is suspended"; + r = -EINVAL; + goto bad; + } list_add_tail_rcu(&tc->list, &tc->pool->active_thins); spin_unlock_irqrestore(&tc->pool->lock, flags); /* @@ -3765,11 +3795,16 @@ static int thin_ctr(struct 
dm_target *ti, unsigned argc, char **argv) */ synchronize_rcu(); + dm_put(pool_md); + + atomic_set(&tc->refcount, 1); + init_completion(&tc->can_destroy); + return 0; -bad_target_max_io_len: +bad: dm_pool_close_thin_device(tc->td); -bad_thin_open: +bad_pool: __pool_dec(tc->pool); bad_pool_lookup: dm_put(pool_md); -- cgit v1.2.3 From ffcc39364160663cda1a3c358f4537302a92459b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 28 Oct 2014 18:34:52 -0400 Subject: dm: enhance internal suspend and resume interface Rename dm_internal_{suspend,resume} to dm_internal_{suspend,resume}_fast -- dm-stats will continue using these methods to avoid all the extra suspend/resume logic that is not needed in order to quickly flush IO. Introduce dm_internal_suspend_noflush() variant that actually calls the mapped_device's target callbacks -- otherwise target-specific hooks are avoided (e.g. dm-thin's thin_presuspend and thin_postsuspend). Common code between dm_internal_{suspend_noflush,resume} and dm_{suspend,resume} was factored out as __dm_{suspend,resume}. Update dm_internal_{suspend_noflush,resume} to always take and release the mapped_device's suspend_lock. Also update dm_{suspend,resume} to be aware of potential for DM_INTERNAL_SUSPEND_FLAG to be set and respond accordingly by interruptibly waiting for the DM_INTERNAL_SUSPEND_FLAG to be cleared. Add lockdep annotation to dm_suspend() and dm_resume(). The existing DM_SUSPEND_FLAG remains unchanged. DM_INTERNAL_SUSPEND_FLAG is set by dm_internal_suspend_noflush() and cleared by dm_internal_resume(). Both DM_SUSPEND_FLAG and DM_INTERNAL_SUSPEND_FLAG may be set if a device was already suspended when dm_internal_suspend_noflush() was called -- this can be thought of as a "nested suspend". A "nested suspend" can occur with legacy userspace dm-thin code that might suspend all active thin volumes before suspending the pool for resize. But otherwise, in the normal dm-thin-pool suspend case moving forward: the thin-pool will have DM_SUSPEND_FLAG set and all active thins from that thin-pool will have DM_INTERNAL_SUSPEND_FLAG set. Also add DM_INTERNAL_SUSPEND_FLAG to status report. This new DM_INTERNAL_SUSPEND_FLAG state is being reported to assist with debugging (e.g. 'dmsetup info' will report an internally suspended device accordingly). 
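In kernel code the two flavours would be used roughly like this (schematic sketch; assumes a valid struct mapped_device reference is held):

	/* dm-stats style: fast flush, no target callbacks involved. */
	static void stats_quiesce(struct mapped_device *md)
	{
		dm_internal_suspend_fast(md);
		/* ...safely walk/modify in-core stats structures... */
		dm_internal_resume_fast(md);
	}

	/* dm-thin style: the full presuspend/postsuspend target hooks run. */
	static void thin_quiesce(struct mapped_device *thin_md)
	{
		dm_internal_suspend_noflush(thin_md);	/* sets DMF_SUSPENDED_INTERNALLY */
		/* ...reload/resize the pool underneath the thin device... */
		dm_internal_resume(thin_md);		/* clears the flag, wakes waiters */
	}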
Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-ioctl.c | 5 +- drivers/md/dm-stats.c | 2 +- drivers/md/dm.c | 229 ++++++++++++++++++++++++++++++++++++++------------ drivers/md/dm.h | 9 ++ 4 files changed, 187 insertions(+), 58 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 0be9381365d7..73f791bb9ea4 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -684,11 +684,14 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) int srcu_idx; param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | - DM_ACTIVE_PRESENT_FLAG); + DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG); if (dm_suspended_md(md)) param->flags |= DM_SUSPEND_FLAG; + if (dm_suspended_internally_md(md)) + param->flags |= DM_INTERNAL_SUSPEND_FLAG; + if (dm_test_deferred_remove_flag(md)) param->flags |= DM_DEFERRED_REMOVE; diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 28a90122a5a8..42a057aa3811 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -824,7 +824,7 @@ static int message_stats_create(struct mapped_device *md, return 1; id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data, - dm_internal_suspend, dm_internal_resume, md); + dm_internal_suspend_fast, dm_internal_resume_fast, md); if (id < 0) return id; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f84de3215982..a0ece87ad426 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_MERGE_IS_OPTIONAL 6 #define DMF_DEFERRED_REMOVE 7 +#define DMF_SUSPENDED_INTERNALLY 8 /* * A dummy definition to make RCU happy. @@ -2718,36 +2720,18 @@ static void unlock_fs(struct mapped_device *md) } /* - * We need to be able to change a mapping table under a mounted - * filesystem. For example we might want to move some data in - * the background. Before the table can be swapped with - * dm_bind_table, dm_suspend must be called to flush any in - * flight bios and ensure that any further io gets deferred. - */ -/* - * Suspend mechanism in request-based dm. - * - * 1. Flush all I/Os by lock_fs() if needed. - * 2. Stop dispatching any I/O by stopping the request_queue. - * 3. Wait for all in-flight I/Os to be completed or requeued. + * If __dm_suspend returns 0, the device is completely quiescent + * now. There is no request-processing activity. All new requests + * are being added to md->deferred list. * - * To abort suspend, start the request_queue. + * Caller must hold md->suspend_lock */ -int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +static int __dm_suspend(struct mapped_device *md, struct dm_table *map, + unsigned suspend_flags, int interruptible) { - struct dm_table *map = NULL; - int r = 0; - int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; - int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; - - mutex_lock(&md->suspend_lock); - - if (dm_suspended_md(md)) { - r = -EINVAL; - goto out_unlock; - } - - map = rcu_dereference(md->map); + bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; + bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; + int r; /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 
@@ -2772,7 +2756,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) r = lock_fs(md); if (r) { dm_table_presuspend_undo_targets(map); - goto out_unlock; + return r; } } @@ -2806,7 +2790,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * We call dm_wait_for_completion to wait for all existing requests * to finish. */ - r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); + r = dm_wait_for_completion(md, interruptible); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); @@ -2822,14 +2806,55 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) unlock_fs(md); dm_table_presuspend_undo_targets(map); - goto out_unlock; /* pushback list is already flushed, so skip flush */ + /* pushback list is already flushed, so skip flush */ } - /* - * If dm_wait_for_completion returned 0, the device is completely - * quiescent now. There is no request-processing activity. All new - * requests are being added to md->deferred list. - */ + return r; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight bios and ensure that any further io gets deferred. + */ +/* + * Suspend mechanism in request-based dm. + * + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. + * + * To abort suspend, start the request_queue. + */ +int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +{ + struct dm_table *map = NULL; + int r = 0; + +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (dm_suspended_md(md)) { + r = -EINVAL; + goto out_unlock; + } + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference(md->map); + + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); + if (r) + goto out_unlock; set_bit(DMF_SUSPENDED, &md->flags); @@ -2840,35 +2865,57 @@ out_unlock: return r; } +static int __dm_resume(struct mapped_device *md, struct dm_table *map) +{ + if (map) { + int r = dm_table_resume_targets(map); + if (r) + return r; + } + + dm_queue_flush(md); + + /* + * Flushing deferred I/Os must be done after targets are resumed + * so that mapping of targets can work correctly. + * Request-based dm is queueing the deferred I/Os in its request_queue. 
+ */ + if (dm_request_based(md)) + start_queue(md->queue); + + unlock_fs(md); + + return 0; +} + int dm_resume(struct mapped_device *md) { int r = -EINVAL; struct dm_table *map = NULL; - mutex_lock(&md->suspend_lock); +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + if (!dm_suspended_md(md)) goto out; + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + map = rcu_dereference(md->map); if (!map || !dm_table_get_size(map)) goto out; - r = dm_table_resume_targets(map); + r = __dm_resume(md, map); if (r) goto out; - dm_queue_flush(md); - - /* - * Flushing deferred I/Os must be done after targets are resumed - * so that mapping of targets can work correctly. - * Request-based dm is queueing the deferred I/Os in its request_queue. - */ - if (dm_request_based(md)) - start_queue(md->queue); - - unlock_fs(md); - clear_bit(DMF_SUSPENDED, &md->flags); r = 0; @@ -2882,15 +2929,80 @@ out: * Internal suspend/resume works like userspace-driven suspend. It waits * until all bios finish and prevents issuing new bios to the target drivers. * It may be used only from the kernel. - * - * Internal suspend holds md->suspend_lock, which prevents interaction with - * userspace-driven suspend. */ -void dm_internal_suspend(struct mapped_device *md) +static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) { - mutex_lock(&md->suspend_lock); + struct dm_table *map = NULL; + + if (dm_suspended_internally_md(md)) + return; /* nested internal suspend */ + + if (dm_suspended_md(md)) { + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + return; /* nest suspend */ + } + + map = rcu_dereference(md->map); + + /* + * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is + * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend + * would require changing .presuspend to return an error -- avoid this + * until there is a need for more elaborate variants of internal suspend. + */ + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); + + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + + dm_table_postsuspend_targets(map); +} + +static void __dm_internal_resume(struct mapped_device *md) +{ + if (!dm_suspended_internally_md(md)) + return; /* resume from nested internal suspend */ + if (dm_suspended_md(md)) + goto done; /* resume from nested suspend */ + + /* + * NOTE: existing callers don't need to call dm_table_resume_targets + * (which may fail -- so best to avoid it for now by passing NULL map) + */ + (void) __dm_resume(md, NULL); + +done: + clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + smp_mb__after_atomic(); + wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); +} + +void dm_internal_suspend_noflush(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); + +void dm_internal_resume(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_resume(md); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_resume); + +/* + * Fast variants of internal suspend/resume hold md->suspend_lock, + * which prevents interaction with userspace-driven suspend. 
+ */ + +void dm_internal_suspend_fast(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) return; set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); @@ -2899,9 +3011,9 @@ void dm_internal_suspend(struct mapped_device *md) dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); } -void dm_internal_resume(struct mapped_device *md) +void dm_internal_resume_fast(struct mapped_device *md) { - if (dm_suspended_md(md)) + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) goto done; dm_queue_flush(md); @@ -2987,6 +3099,11 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_suspended_internally_md(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); +} + int dm_test_deferred_remove_flag(struct mapped_device *md) { return test_bit(DMF_DEFERRED_REMOVE, &md->flags); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 781994093bf5..84b0f9e4ba6c 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -129,6 +129,15 @@ int dm_deleting_md(struct mapped_device *md); */ int dm_suspended_md(struct mapped_device *md); +/* + * Internal suspend and resume methods. + */ +int dm_suspended_internally_md(struct mapped_device *md); +void dm_internal_suspend_fast(struct mapped_device *md); +void dm_internal_resume_fast(struct mapped_device *md); +void dm_internal_suspend_noflush(struct mapped_device *md); +void dm_internal_resume(struct mapped_device *md); + /* * Test if the device is scheduled for deferred remove. */ -- cgit v1.2.3 From 583024d248f486e21479d1912aa2093565455770 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 28 Oct 2014 20:58:45 -0400 Subject: dm thin: suspend/resume active thin devices when reloading thin-pool Before this change it was expected that userspace would first suspend all active thin devices, reload/resize the thin-pool target, then resume all active thin devices. Now the thin-pool suspend/resume will trigger the suspend/resume of all active thins via appropriate calls to dm_internal_suspend and dm_internal_resume. Store the mapped_device for each thin device in struct thin_c to make these calls possible. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index f1b53e31d868..e9e9584fe769 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -292,6 +292,8 @@ struct thin_c { struct pool *pool; struct dm_thin_device *td; + struct mapped_device *thin_md; + bool requeue_mode:1; spinlock_t lock; struct list_head deferred_cells; @@ -3113,19 +3115,48 @@ static int pool_preresume(struct dm_target *ti) return 0; } +static void pool_suspend_active_thins(struct pool *pool) +{ + struct thin_c *tc; + + /* Suspend all active thin devices */ + tc = get_first_thin(pool); + while (tc) { + dm_internal_suspend_noflush(tc->thin_md); + tc = get_next_thin(pool, tc); + } +} + +static void pool_resume_active_thins(struct pool *pool) +{ + struct thin_c *tc; + + /* Resume all active thin devices */ + tc = get_first_thin(pool); + while (tc) { + dm_internal_resume(tc->thin_md); + tc = get_next_thin(pool, tc); + } +} + static void pool_resume(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; unsigned long flags; + /* + * Must requeue active_thins' bios and then resume + * active_thins _before_ clearing 'suspend' flag. 
+ */ + requeue_bios(pool); + pool_resume_active_thins(pool); + spin_lock_irqsave(&pool->lock, flags); pool->low_water_triggered = false; pool->suspended = false; spin_unlock_irqrestore(&pool->lock, flags); - requeue_bios(pool); - do_waker(&pool->waker.work); } @@ -3138,6 +3169,8 @@ static void pool_presuspend(struct dm_target *ti) spin_lock_irqsave(&pool->lock, flags); pool->suspended = true; spin_unlock_irqrestore(&pool->lock, flags); + + pool_suspend_active_thins(pool); } static void pool_presuspend_undo(struct dm_target *ti) @@ -3146,6 +3179,8 @@ static void pool_presuspend_undo(struct dm_target *ti) struct pool *pool = pt->pool; unsigned long flags; + pool_resume_active_thins(pool); + spin_lock_irqsave(&pool->lock, flags); pool->suspended = false; spin_unlock_irqrestore(&pool->lock, flags); @@ -3703,6 +3738,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -ENOMEM; goto out_unlock; } + tc->thin_md = dm_table_get_md(ti->table); spin_lock_init(&tc->lock); INIT_LIST_HEAD(&tc->deferred_cells); bio_list_init(&tc->deferred_bio_list); -- cgit v1.2.3 From d200c30ef00dd03aec6f1aeaac1546c6e515cbc0 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 20 Nov 2014 18:07:43 -0500 Subject: dm thin: fix pool_io_hints to avoid looking at max_hw_sectors Simplify the pool_io_hints code that works to establish a max_sectors value that is a power-of-2 factor of the thin-pool's blocksize. The biggest associated improvement is that the DM thin-pool is no longer concerning itself with the data device's max_hw_sectors when adjusting max_sectors. This fixes the relative fragility of the original "dm thin: adjust max_sectors_kb based on thinp blocksize" commit that only became apparent when testing was performed using a DM thin-pool on top of a virtio_blk device. One proposed upstream patch detailed the problems inherent in virtio_blk: https://lkml.org/lkml/2014/11/20/611 So even though virtio_blk incorrectly set its max_hw_sectors, it actually helped make it clear that we need DM thinp to be tolerant of any future Linux driver that incorrectly sets max_hw_sectors. We only need to be concerned with modifying the thin-pool device's max_sectors limit if it is smaller than the thin-pool's blocksize. In this case the value of max_sectors does become a limiting factor when upper layers (e.g. filesystems) construct their bios. But if the hardware can support IOs larger than the thin-pool's blocksize the user is encouraged to adjust the thin-pool's data device's max_sectors accordingly -- doing so will enable the thin-pool to inherit the established user-defined max_sectors. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index e9e9584fe769..8735543eacdb 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3587,27 +3587,20 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; /* - * Adjust max_sectors_kb to highest possible power-of-2 - * factor of pool->sectors_per_block. + * If max_sectors is smaller than pool->sectors_per_block adjust it + * to the highest possible power-of-2 factor of pool->sectors_per_block. 
+ * This is especially beneficial when the pool's data device is a RAID + * device that has a full stripe width that matches pool->sectors_per_block + * -- because even though partial RAID stripe-sized IOs will be issued to a + * single RAID stripe; when aggregated they will end on a full RAID stripe + * boundary.. which avoids additional partial RAID stripe writes cascading */ - if (limits->max_hw_sectors & (limits->max_hw_sectors - 1)) - limits->max_sectors = rounddown_pow_of_two(limits->max_hw_sectors); - else - limits->max_sectors = limits->max_hw_sectors; - if (limits->max_sectors < pool->sectors_per_block) { while (!is_factor(pool->sectors_per_block, limits->max_sectors)) { if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) limits->max_sectors--; limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); } - } else if (block_size_is_power_of_two(pool)) { - /* max_sectors_kb is >= power-of-2 thinp blocksize */ - while (!is_factor(limits->max_sectors, pool->sectors_per_block)) { - if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) - limits->max_sectors--; - limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); - } } /* -- cgit v1.2.3 From a12f5d48bdfeb5fe10157ac01c3de29269f457c6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Nov 2014 09:34:29 -0800 Subject: dm: use rcu_dereference_protected instead of rcu_dereference rcu_dereference() should be used in sections protected by rcu_read_lock. For writers, holding some kind of mutex or lock, rcu_dereference_protected() is the way to go, adding explicit lockdep bits. In __unbind(), we are the last user of this mapped device, so can use the constant '1' instead of a lockdep_is_held(), not consistent with other uses of rcu_dereference_protected() which use md->suspend_lock mutex. Reported-by: Kirill A. 
Shutemov Signed-off-by: Eric Dumazet Fixes: 33423974bfc1 ("dm: Use rcu_dereference() for accessing rcu pointer") Cc: Pranith Kumar [snitzer: allow lines longer than 80 columns, refine subject] Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a0ece87ad426..8f37ed215b19 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2335,7 +2335,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, merge_is_optional = dm_table_merge_is_optional(t); - old_map = rcu_dereference(md->map); + old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, t); md->immutable_target_type = dm_table_get_immutable_target_type(t); @@ -2355,7 +2355,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, */ static struct dm_table *__unbind(struct mapped_device *md) { - struct dm_table *map = rcu_dereference(md->map); + struct dm_table *map = rcu_dereference_protected(md->map, 1); if (!map) return NULL; @@ -2850,7 +2850,7 @@ retry: goto retry; } - map = rcu_dereference(md->map); + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); if (r) @@ -2908,7 +2908,7 @@ retry: goto retry; } - map = rcu_dereference(md->map); + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); if (!map || !dm_table_get_size(map)) goto out; @@ -2943,7 +2943,7 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla return; /* nest suspend */ } - map = rcu_dereference(md->map); + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); /* * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is -- cgit v1.2.3 From 8001e87d0ee98787c46f14f5f4f97aced70f119f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 24 Nov 2014 14:08:57 +0000 Subject: dm array: if resizing the array is a noop set the new root to the old one This could've been quite bad (to return success but not update the new root to point at the old) but in practice the only known consumer of the dm array code is the DM cache target. And the DM cache target passes in the same old root to array_resize() anyway. 
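The bug class in miniature (illustrative names, not the dm-array code): a function with an out-parameter must write it on every success path, including the early-return noop:

	/* Sketch of the fixed control flow; do_resize is a hypothetical helper. */
	static int resize(uint64_t root, uint64_t old_size, uint64_t new_size,
			  uint64_t *new_root)
	{
		if (old_size == new_size) {
			*new_root = root;	/* noop is still success: set the out-param */
			return 0;
		}

		/* ...real resize work, which ends by setting *new_root... */
		return do_resize(root, old_size, new_size, new_root);
	}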
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-array.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 1d75b1dc1e2e..e64b61ad0ef3 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -645,8 +645,10 @@ static int array_resize(struct dm_array_info *info, dm_block_t root, int r; struct resize resize; - if (old_size == new_size) + if (old_size == new_size) { + *new_root = root; return 0; + } resize.info = info; resize.root = root; -- cgit v1.2.3 From 2572629a1318eb9e13e70fa59756d7bcfb80319e Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 24 Nov 2014 14:05:16 +0000 Subject: dm cache: fix some issues with the new discard range support Commit 7ae34e777 ("dm cache: improve discard support") needed to also: - discontinue having DM core split the discard bios on cache block boundaries - calculate the cache's discard_nr_blocks relative to the determined discard_block_size rather than using oblock_to_dblock() Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index abdd45d07bf6..41e7cfdb450d 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2338,8 +2338,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; ti->discard_zeroes_data_unsupported = true; - /* Discard bios must be split on a block boundary */ - ti->split_discard_bios = true; + ti->split_discard_bios = false; cache->features = ca->features; ti->per_bio_data_size = get_per_bio_data_size(cache); @@ -2440,7 +2439,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) cache->discard_block_size = calculate_discard_block_size(cache->sectors_per_block, cache->origin_sectors); - cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks); + cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, + cache->discard_block_size)); cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); if (!cache->discard_bitset) { *error = "could not allocate discard bitset"; -- cgit v1.2.3 From 3e2e1c3098fcc02369f0eea822d0a7914b691567 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 24 Nov 2014 14:06:22 +0000 Subject: dm cache: when reloading a discard bitset allow for a different discard block size The discard block size can change if the origin changes size or if an old DM cache is upgraded from using a discard block size that was equal to cache block size. To fix this an extent of discarded blocks is established for the purpose of translating the old discard block size to the new in-core discard block size and set bits. The old (potentially huge) discard bitset is left ondisk until it is re-written using the new in-core information on the next successful DM cache shutdown. 
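The heart of that translation is sector arithmetic: widen the old-size run of discard blocks to sectors, then round its start up and its end down to new-size block boundaries, so only fully covered new blocks are marked. A standalone sketch (illustrative names, assuming 64-bit sector counts):

	#include <stdint.h>

	static uint64_t div_up(uint64_t n, uint64_t d)
	{
		return (n + d - 1) / d;	/* same job as the kernel's dm_sector_div_up */
	}

	/*
	 * Mark the new-size discard blocks fully covered by the old-size
	 * block range [begin, end).  Partially covered blocks stay clear,
	 * which errs on the side of "not discarded" -- safe for a hint.
	 */
	static void translate_discard_run(uint64_t begin, uint64_t end,
					  uint64_t old_block_sectors,
					  uint64_t new_block_sectors,
					  void (*set_discard)(uint64_t dblock))
	{
		uint64_t b = div_up(begin * old_block_sectors, new_block_sectors);
		uint64_t e = (end * old_block_sectors) / new_block_sectors;

		while (b < e)
			set_discard(b++);
	}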
Fixes: 7ae34e777896 ("dm cache: improve discard support") Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 94 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 41e7cfdb450d..2c66315553f2 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2817,17 +2817,86 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, return 0; } +/* + * The discard block size in the on disk metadata is not + * neccessarily the same as we're currently using. So we have to + * be careful to only set the discarded attribute if we know it + * covers a complete block of the new size. + */ +struct discard_load_info { + struct cache *cache; + + /* + * These blocks are sized using the on disk dblock size, rather + * than the current one. + */ + dm_block_t block_size; + dm_block_t discard_begin, discard_end; +}; + +static void discard_load_info_init(struct cache *cache, + struct discard_load_info *li) +{ + li->cache = cache; + li->discard_begin = li->discard_end = 0; +} + +static void set_discard_range(struct discard_load_info *li) +{ + sector_t b, e; + + if (li->discard_begin == li->discard_end) + return; + + /* + * Convert to sectors. + */ + b = li->discard_begin * li->block_size; + e = li->discard_end * li->block_size; + + /* + * Then convert back to the current dblock size. + */ + b = dm_sector_div_up(b, li->cache->discard_block_size); + sector_div(e, li->cache->discard_block_size); + + /* + * The origin may have shrunk, so we need to check we're still in + * bounds. + */ + if (e > from_dblock(li->cache->discard_nr_blocks)) + e = from_dblock(li->cache->discard_nr_blocks); + + for (; b < e; b++) + set_discard(li->cache, to_dblock(b)); +} + static int load_discard(void *context, sector_t discard_block_size, dm_dblock_t dblock, bool discard) { - struct cache *cache = context; + struct discard_load_info *li = context; - /* FIXME: handle mis-matched block size */ + li->block_size = discard_block_size; - if (discard) - set_discard(cache, dblock); - else - clear_discard(cache, dblock); + if (discard) { + if (from_dblock(dblock) == li->discard_end) + /* + * We're already in a discard range, just extend it. + */ + li->discard_end = li->discard_end + 1ULL; + + else { + /* + * Emit the old range and start a new one. + */ + set_discard_range(li); + li->discard_begin = from_dblock(dblock); + li->discard_end = li->discard_begin + 1ULL; + } + } else { + set_discard_range(li); + li->discard_begin = li->discard_end = 0; + } return 0; } @@ -2911,11 +2980,22 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_discards) { - r = dm_cache_load_discards(cache->cmd, load_discard, cache); + struct discard_load_info li; + + /* + * The discard bitset could have been resized, or the + * discard block size changed. To be safe we start by + * setting every dblock to not discarded. 
+ */ + clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); + + discard_load_info_init(cache, &li); + r = dm_cache_load_discards(cache->cmd, load_discard, &li); if (r) { DMERR("could not load origin discards"); return r; } + set_discard_range(&li); cache->loaded_discards = true; } -- cgit v1.2.3 From 43c32bf2b0c16d292f4f214dfd16f9cb205e4e81 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 25 Nov 2014 13:14:57 +0000 Subject: dm cache: fix a harmless race when working out if a block is discarded It is more correct to hold the cell before checking the discard state. These flags are only used as hints to the policy so this change will have negligible effect. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2c66315553f2..161bbd6652f8 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1432,9 +1432,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, dm_oblock_t block = get_bio_block(cache, bio); struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; struct policy_result lookup_result; - bool discarded_block = is_discarded_oblock(cache, block); bool passthrough = passthrough_mode(&cache->features); - bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); + bool discarded_block, can_migrate; /* * Check to see if that block is currently migrating. @@ -1446,6 +1445,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs, if (r > 0) return; + discarded_block = is_discarded_oblock(cache, block); + can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); + r = policy_map(cache->policy, block, true, can_migrate, discarded_block, bio, &lookup_result); -- cgit v1.2.3 From 2bb812df63bbd246bd39d10f2e810b2a0a59e99e Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 26 Nov 2014 16:07:50 +0000 Subject: dm cache: discard block size must be a multiple of cache block size Otherwise the cache blocks may span two discard blocks, which we don't handle when doing the discard lookup. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 161bbd6652f8..fd7f61387283 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2275,9 +2275,8 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, } /* - * We want the discard block size to be a power of two, at least the size - * of the cache block size, and have no more than 2^14 discard blocks - * across the origin. + * We want the discard block size to be at least the size of the cache + * block size and have no more than 2^14 discard blocks across the origin. 
*/ #define MAX_DISCARD_BLOCKS (1 << 14) @@ -2292,9 +2291,7 @@ static bool too_many_discard_blocks(sector_t discard_block_size, static sector_t calculate_discard_block_size(sector_t cache_block_size, sector_t origin_size) { - sector_t discard_block_size; - - discard_block_size = roundup_pow_of_two(cache_block_size); + sector_t discard_block_size = cache_block_size; if (origin_size) while (too_many_discard_blocks(discard_block_size, origin_size)) -- cgit v1.2.3 From f29a3147e251d7ae20d3194ff67f109d71e501b4 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 27 Nov 2014 12:21:08 +0000 Subject: dm cache: only use overwrite optimisation for promotion when in writeback mode Overwrite causes the cache block and origin blocks to diverge, which is only allowed in writeback mode. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-cache-target.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index fd7f61387283..ef842feda101 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1142,7 +1142,8 @@ static void issue_copy_or_discard(struct dm_cache_migration *mg) avoid = is_discarded_oblock(cache, mg->new_oblock); - if (!avoid && bio_writes_complete_block(cache, bio)) { + if (writeback_mode(&cache->features) && + !avoid && bio_writes_complete_block(cache, bio)) { issue_overwrite(mg, bio); return; } -- cgit v1.2.3 From 1e32134a5a404e80bfb47fad8a94e9bbfcbdacc5 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 27 Nov 2014 12:26:46 +0000 Subject: dm cache: dirty flag was mistakenly being cleared when promoting via overwrite If the incoming bio is a WRITE and completely covers a block then we don't bother to do any copying for a promotion operation. Once this is done the cache block and origin block will be different, so we need to set it to 'dirty'. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-cache-target.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index ef842feda101..1a090de0c4b8 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -988,10 +988,14 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) } } else { - clear_dirty(cache, mg->new_oblock, mg->cblock); - if (mg->requeue_holder) + if (mg->requeue_holder) { + clear_dirty(cache, mg->new_oblock, mg->cblock); cell_defer(cache, mg->new_ocell, true); - else { + } else { + /* + * The block was promoted via an overwrite, so it's dirty. + */ + set_dirty(cache, mg->new_oblock, mg->cblock); bio_endio(mg->new_ocell->holder, 0); cell_defer(cache, mg->new_ocell, false); } -- cgit v1.2.3 From f824a2af3dfbbb766c02e19df21f985bceadf0ee Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 28 Nov 2014 09:48:25 +0000 Subject: dm cache: fix spurious cell_defer when dealing with partial block at end of device We never bother caching a partial block that is at the back end of the origin device. No cell ever gets locked, but the calling code was assuming it was and trying to release it. Now the code only releases if the cell has been set to a non NULL value. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-cache-target.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1a090de0c4b8..1e96d7889f51 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2690,11 +2690,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso static int cache_map(struct dm_target *ti, struct bio *bio) { int r; - struct dm_bio_prison_cell *cell; + struct dm_bio_prison_cell *cell = NULL; struct cache *cache = ti->private; r = __cache_map(cache, bio, &cell); - if (r == DM_MAPIO_REMAPPED) { + if (r == DM_MAPIO_REMAPPED && cell) { inc_ds(cache, bio, cell); cell_defer(cache, cell, false); } -- cgit v1.2.3 From 445559cdcb98a141f5de415b94fd6eaccab87e6d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 25 Nov 2014 17:45:15 -0800 Subject: dm bufio: fix memleak when using a dm_buffer's inline bio When dm-bufio sets out to use the bio built into a struct dm_buffer to issue an IO, it needs to call bio_reset after it's done with the bio so that we can free things attached to the bio such as the integrity payload. Therefore, inject our own endio callback to take care of the bio_reset after calling submit_io's end_io callback. Test case: 1. modprobe scsi_debug delay=0 dif=1 dix=199 ato=1 dev_size_mb=300 2. Set up a dm-bufio client, e.g. dm-verity, on the scsi_debug device 3. Repeatedly read metadata and watch kmalloc-192 leak! Signed-off-by: Darrick J. Wong Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-bufio.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 99579c81ae0a..9a4b1bf6f3f4 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -578,6 +578,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, end_io(&b->bio, r); } +static void inline_endio(struct bio *bio, int error) +{ + bio_end_io_t *end_fn = bio->bi_private; + + /* + * Reset the bio to free any attached resources + * (e.g. bio integrity profiles). + */ + bio_reset(bio); + + end_fn(bio, error); +} + static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, bio_end_io_t *end_io) { @@ -589,7 +602,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; b->bio.bi_bdev = b->c->bdev; - b->bio.bi_end_io = end_io; + b->bio.bi_end_io = inline_endio; + /* + * Use of .bi_private isn't a problem here because + * the dm_buffer's inline bio is local to bufio. + */ + b->bio.bi_private = end_io; /* * We assume that if len >= PAGE_SIZE ptr is page-aligned. -- cgit v1.2.3 From c1c6156fe4d4577444b769d7edd5dd503e57bbc9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 29 Nov 2014 15:50:21 +0300 Subject: dm space map metadata: fix sm_bootstrap_get_nr_blocks() This function isn't right and it causes a static checker warning: drivers/md/dm-thin.c:3016 maybe_resize_data_dev() error: potentially using uninitialized 'sb_data_size'. It should set "*count" and return zero on success the same as the sm_metadata_get_nr_blocks() function does earlier. 
Fixes: 3241b1d3e0aa ('dm: add persistent data library') Signed-off-by: Dan Carpenter Acked-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-space-map-metadata.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 786b689bdfc7..f4e22bcc7fb8 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -564,7 +564,9 @@ static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return smm->ll.nr_blocks; + *count = smm->ll.nr_blocks; + + return 0; } static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) -- cgit v1.2.3 From 02717d9855400c12c6e338ce1f5c2e1310def49a Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 1 Dec 2014 14:38:11 +0000 Subject: dm space map metadata: fix sm_bootstrap_get_count() Must set 'result' accordingly rather than return it. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-space-map-metadata.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index f4e22bcc7fb8..e8a904298887 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -583,7 +583,9 @@ static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b, { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return b < smm->begin ? 1 : 0; + *result = (b < smm->begin) ? 1 : 0; + + return 0; } static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, -- cgit v1.2.3 From 1a71d6ffe18c0d0f03fc8531949cc8ed41d702ee Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Sat, 22 Nov 2014 09:36:04 +0100 Subject: dm crypt: use memzero_explicit for on-stack buffer Use memzero_explicit to cleanup sensitive data allocated on stack to prevent the compiler from optimizing and removing memset() calls. Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index cd15e0801228..ce11a90a33c3 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -711,7 +711,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) crypto_xor(data + i * 8, buf, 8); out: - memset(buf, 0, sizeof(buf)); + memzero_explicit(buf, sizeof(buf)); return r; } -- cgit v1.2.3
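Why memzero_explicit() matters, as a standalone user-space analogue (illustrative only; the kernel helper is a memset followed by a compiler barrier):

	#include <string.h>

	/*
	 * A plain memset() of a stack buffer that is never read again is a
	 * dead store, which the compiler is allowed to delete -- leaving
	 * key material in memory.  Calling memset through a volatile
	 * function pointer is one common way to force the store to happen.
	 */
	static void *(*const volatile memset_fn)(void *, int, size_t) = memset;

	static void explicit_zero(void *p, size_t n)
	{
		memset_fn(p, 0, n);	/* cannot be elided: pointer is volatile */
	}

	void whiten_bad(void)
	{
		unsigned char buf[8];
		/* ...derive whitening data in buf, use it... */
		memset(buf, 0, sizeof(buf));	/* dead store: may be optimised away */
	}

	void whiten_good(void)
	{
		unsigned char buf[8];
		/* ...derive whitening data in buf, use it... */
		explicit_zero(buf, sizeof(buf));	/* survives optimisation */
	}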