From 988dfbd795cf08b00576c1ced4210281b2bccffc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Mar 2015 09:27:55 +1100 Subject: rhashtable: Move hash_rnd into bucket_table Currently hash_rnd is a parameter that users can set. However, no existing users set this parameter. It is also something that people are unlikely to want to set directly since it's just a random number. In preparation for allowing the reseeding/rehashing of rhashtable, this patch moves hash_rnd into bucket_table so that it's now an internal state rather than a parameter. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index b5344ef4c684..ba15dceee27f 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -66,25 +66,28 @@ static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) return hash & (tbl->size - 1); } -static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) +static u32 obj_raw_hashfn(struct rhashtable *ht, const void *ptr) { + struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); u32 hash; if (unlikely(!ht->p.key_len)) - hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd); + hash = ht->p.obj_hashfn(ptr, tbl->hash_rnd); else hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, - ht->p.hash_rnd); + tbl->hash_rnd); return hash >> HASH_RESERVED_SPACE; } static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) { - return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE; + struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + + return ht->p.hashfn(key, len, tbl->hash_rnd) >> HASH_RESERVED_SPACE; } -static u32 head_hashfn(const struct rhashtable *ht, +static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { @@ -92,7 +95,7 @@ static u32 head_hashfn(const struct rhashtable *ht, } #ifdef CONFIG_PROVE_LOCKING -static void debug_dump_buckets(const struct rhashtable *ht, +static void debug_dump_buckets(struct rhashtable *ht, const struct bucket_table *tbl) { struct rhash_head *he; @@ -385,6 +388,8 @@ int rhashtable_expand(struct rhashtable *ht) if (new_tbl == NULL) return -ENOMEM; + new_tbl->hash_rnd = old_tbl->hash_rnd; + atomic_inc(&ht->shift); /* Make insertions go into the new, empty table right away. Deletions @@ -476,6 +481,8 @@ int rhashtable_shrink(struct rhashtable *ht) if (new_tbl == NULL) return -ENOMEM; + new_tbl->hash_rnd = tbl->hash_rnd; + rcu_assign_pointer(ht->future_tbl, new_tbl); synchronize_rcu(); @@ -1099,14 +1106,13 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (tbl == NULL) return -ENOMEM; + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + atomic_set(&ht->nelems, 0); atomic_set(&ht->shift, ilog2(tbl->size)); RCU_INIT_POINTER(ht->tbl, tbl); RCU_INIT_POINTER(ht->future_tbl, tbl); - if (!ht->p.hash_rnd) - get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); - INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; -- cgit v1.2.3 From aa34a6cb0478842452bac58edb50d3ef9e178c92 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 11 Mar 2015 09:43:48 +1100 Subject: rhashtable: Add arbitrary rehash function This patch adds a rehash function that supports the use of any hash function for the new table. This is needed to support changing the random seed value during the lifetime of the hash table. 
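To make the per-table seed concrete: below is a minimal, single-threaded userspace sketch of the idea (toy seeded hash, invented struct and function names, no RCU or bucket locking — not the kernel code). Keeping hash_rnd inside the table means a rehash can allocate the replacement table with a fresh seed and simply relink every entry under the new hash.

/* Single-threaded userspace sketch only: illustrates a per-table seed
 * plus a rehash that reseeds, nothing more.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
        struct node *next;
        uint32_t key;
};

struct table {
        uint32_t hash_rnd;      /* per-table seed, as in the patch */
        unsigned int size;      /* power of two */
        struct node **buckets;
};

static uint32_t hash_key(uint32_t key, uint32_t seed)
{
        /* stand-in for jhash(): any seeded mix will do for the sketch */
        uint32_t h = key ^ seed;

        h *= 0x9e3779b1u;
        return h ^ (h >> 16);
}

static struct table *table_alloc(unsigned int size)
{
        struct table *t = malloc(sizeof(*t));

        t->size = size;
        t->hash_rnd = (uint32_t)rand();         /* fresh seed per table */
        t->buckets = calloc(size, sizeof(*t->buckets));
        return t;
}

static void table_insert(struct table *t, struct node *n)
{
        unsigned int b = hash_key(n->key, t->hash_rnd) & (t->size - 1);

        n->next = t->buckets[b];
        t->buckets[b] = n;
}

/* Rehash every entry of 'old' into a freshly seeded table of 'new_size'. */
static struct table *table_rehash(struct table *old, unsigned int new_size)
{
        struct table *new = table_alloc(new_size);
        unsigned int i;

        for (i = 0; i < old->size; i++) {
                struct node *n = old->buckets[i], *next;

                for (; n; n = next) {
                        next = n->next;
                        table_insert(new, n);   /* re-bucketed under new seed */
                }
        }
        free(old->buckets);
        free(old);
        return new;
}

int main(void)
{
        struct table *t = table_alloc(4);
        struct node n1 = { .key = 1 }, n2 = { .key = 2 };

        table_insert(t, &n1);
        table_insert(t, &n2);
        t = table_rehash(t, 8);         /* expand and reseed in one step */
        printf("rehashed into %u buckets, seed %#x\n",
               t->size, (unsigned int)t->hash_rnd);
        free(t->buckets);
        free(t);
        return 0;
}
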
However for now the random seed value is still constant and the rehash function is simply used to replace the existing expand/shrink functions. [ ASSERT_BUCKET_LOCK() and thus debug_dump_table() + debug_dump_buckets() are not longer used, so delete them entirely. -DaveM ] Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 506 +++++++++++++++++++------------------------------------ 1 file changed, 174 insertions(+), 332 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index ba15dceee27f..b1c19c5fb326 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -66,9 +66,9 @@ static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) return hash & (tbl->size - 1); } -static u32 obj_raw_hashfn(struct rhashtable *ht, const void *ptr) +static u32 obj_raw_hashfn(struct rhashtable *ht, + const struct bucket_table *tbl, const void *ptr) { - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); u32 hash; if (unlikely(!ht->p.key_len)) @@ -80,10 +80,9 @@ static u32 obj_raw_hashfn(struct rhashtable *ht, const void *ptr) return hash >> HASH_RESERVED_SPACE; } -static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) +static u32 key_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, + const void *key, u32 len) { - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - return ht->p.hashfn(key, len, tbl->hash_rnd) >> HASH_RESERVED_SPACE; } @@ -91,60 +90,11 @@ static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { - return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he))); + return rht_bucket_index(tbl, obj_raw_hashfn(ht, tbl, rht_obj(ht, he))); } #ifdef CONFIG_PROVE_LOCKING -static void debug_dump_buckets(struct rhashtable *ht, - const struct bucket_table *tbl) -{ - struct rhash_head *he; - unsigned int i, hash; - - for (i = 0; i < tbl->size; i++) { - pr_warn(" [Bucket %d] ", i); - rht_for_each_rcu(he, tbl, i) { - hash = head_hashfn(ht, tbl, he); - pr_cont("[hash = %#x, lock = %p] ", - hash, bucket_lock(tbl, hash)); - } - pr_cont("\n"); - } - -} - -static void debug_dump_table(struct rhashtable *ht, - const struct bucket_table *tbl, - unsigned int hash) -{ - struct bucket_table *old_tbl, *future_tbl; - - pr_emerg("BUG: lock for hash %#x in table %p not held\n", - hash, tbl); - - rcu_read_lock(); - future_tbl = rht_dereference_rcu(ht->future_tbl, ht); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - if (future_tbl != old_tbl) { - pr_warn("Future table %p (size: %zd)\n", - future_tbl, future_tbl->size); - debug_dump_buckets(ht, future_tbl); - } - - pr_warn("Table %p (size: %zd)\n", old_tbl, old_tbl->size); - debug_dump_buckets(ht, old_tbl); - - rcu_read_unlock(); -} - #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) -#define ASSERT_BUCKET_LOCK(HT, TBL, HASH) \ - do { \ - if (unlikely(!lockdep_rht_bucket_is_held(TBL, HASH))) { \ - debug_dump_table(HT, TBL, HASH); \ - BUG(); \ - } \ - } while (0) int lockdep_rht_mutex_is_held(struct rhashtable *ht) { @@ -161,22 +111,9 @@ int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #else #define ASSERT_RHT_MUTEX(HT) -#define ASSERT_BUCKET_LOCK(HT, TBL, HASH) #endif -static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) -{ - struct rhash_head __rcu **pprev; - - for (pprev = &tbl->buckets[n]; - !rht_is_a_nulls(rht_dereference_bucket(*pprev, tbl, n)); - pprev = 
&rht_dereference_bucket(*pprev, tbl, n)->next) - ; - - return pprev; -} - static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) { unsigned int i, size; @@ -270,101 +207,99 @@ static bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) (atomic_read(&ht->shift) > ht->p.min_shift); } -static void lock_buckets(struct bucket_table *new_tbl, - struct bucket_table *old_tbl, unsigned int hash) - __acquires(old_bucket_lock) +static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) { - spin_lock_bh(bucket_lock(old_tbl, hash)); - if (new_tbl != old_tbl) - spin_lock_bh_nested(bucket_lock(new_tbl, hash), - RHT_LOCK_NESTED); -} + struct bucket_table *new_tbl = rht_dereference(ht->future_tbl, ht); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; + int err = -ENOENT; + struct rhash_head *head, *next, *entry; + spinlock_t *new_bucket_lock; + unsigned new_hash; -static void unlock_buckets(struct bucket_table *new_tbl, - struct bucket_table *old_tbl, unsigned int hash) - __releases(old_bucket_lock) -{ - if (new_tbl != old_tbl) - spin_unlock_bh(bucket_lock(new_tbl, hash)); - spin_unlock_bh(bucket_lock(old_tbl, hash)); -} + rht_for_each(entry, old_tbl, old_hash) { + err = 0; + next = rht_dereference_bucket(entry->next, old_tbl, old_hash); -/** - * Unlink entries on bucket which hash to different bucket. - * - * Returns true if no more work needs to be performed on the bucket. - */ -static bool hashtable_chain_unzip(struct rhashtable *ht, - const struct bucket_table *new_tbl, - struct bucket_table *old_tbl, - size_t old_hash) -{ - struct rhash_head *he, *p, *next; - unsigned int new_hash, new_hash2; + if (rht_is_a_nulls(next)) + break; - ASSERT_BUCKET_LOCK(ht, old_tbl, old_hash); + pprev = &entry->next; + } - /* Old bucket empty, no work needed. */ - p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, - old_hash); - if (rht_is_a_nulls(p)) - return false; + if (err) + goto out; - new_hash = head_hashfn(ht, new_tbl, p); - ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); + new_hash = head_hashfn(ht, new_tbl, entry); - /* Advance the old bucket pointer one or more times until it - * reaches a node that doesn't hash to the same bucket as the - * previous node p. Call the previous node p; - */ - rht_for_each_continue(he, p->next, old_tbl, old_hash) { - new_hash2 = head_hashfn(ht, new_tbl, he); - ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash2); + new_bucket_lock = bucket_lock(new_tbl, new_hash); - if (new_hash != new_hash2) - break; - p = he; - } - rcu_assign_pointer(old_tbl->buckets[old_hash], p->next); + spin_lock(new_bucket_lock); + head = rht_dereference_bucket(new_tbl->buckets[new_hash], + new_tbl, new_hash); - /* Find the subsequent node which does hash to the same - * bucket as node P, or NULL if no such node exists. 
- */ - INIT_RHT_NULLS_HEAD(next, ht, old_hash); - if (!rht_is_a_nulls(he)) { - rht_for_each_continue(he, he->next, old_tbl, old_hash) { - if (head_hashfn(ht, new_tbl, he) == new_hash) { - next = he; - break; - } - } - } + if (rht_is_a_nulls(head)) + INIT_RHT_NULLS_HEAD(entry->next, ht, new_hash); + else + RCU_INIT_POINTER(entry->next, head); - /* Set p's next pointer to that subsequent node pointer, - * bypassing the nodes which do not hash to p's bucket - */ - rcu_assign_pointer(p->next, next); + rcu_assign_pointer(new_tbl->buckets[new_hash], entry); + spin_unlock(new_bucket_lock); - p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, - old_hash); + rcu_assign_pointer(*pprev, next); - return !rht_is_a_nulls(p); +out: + return err; } -static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl, - unsigned int new_hash, struct rhash_head *entry) +static void rhashtable_rehash_chain(struct rhashtable *ht, unsigned old_hash) { - ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + spinlock_t *old_bucket_lock; + + old_bucket_lock = bucket_lock(old_tbl, old_hash); - rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry); + spin_lock_bh(old_bucket_lock); + while (!rhashtable_rehash_one(ht, old_hash)) + ; + spin_unlock_bh(old_bucket_lock); +} + +static void rhashtable_rehash(struct rhashtable *ht, + struct bucket_table *new_tbl) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + unsigned old_hash; + + get_random_bytes(&new_tbl->hash_rnd, sizeof(new_tbl->hash_rnd)); + + /* Make insertions go into the new, empty table right away. Deletions + * and lookups will be attempted in both tables until we synchronize. + * The synchronize_rcu() guarantees for the new table to be picked up + * so no new additions go into the old table while we relink. + */ + rcu_assign_pointer(ht->future_tbl, new_tbl); + + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) + rhashtable_rehash_chain(ht, old_hash); + + /* Publish the new table pointer. */ + rcu_assign_pointer(ht->tbl, new_tbl); + + /* Wait for readers. All new readers will see the new + * table, and thus no references to the old table will + * remain. + */ + synchronize_rcu(); + + bucket_table_free(old_tbl); } /** * rhashtable_expand - Expand hash table while allowing concurrent lookups * @ht: the hash table to expand * - * A secondary bucket array is allocated and the hash entries are migrated - * while keeping them on both lists until the end of the RCU grace period. + * A secondary bucket array is allocated and the hash entries are migrated. * * This function may only be called in a context where it is safe to call * synchronize_rcu(), e.g. not within a rcu_read_lock() section. @@ -378,9 +313,6 @@ static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl, int rhashtable_expand(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); - struct rhash_head *he; - unsigned int new_hash, old_hash; - bool complete = false; ASSERT_RHT_MUTEX(ht); @@ -392,64 +324,8 @@ int rhashtable_expand(struct rhashtable *ht) atomic_inc(&ht->shift); - /* Make insertions go into the new, empty table right away. Deletions - * and lookups will be attempted in both tables until we synchronize. - * The synchronize_rcu() guarantees for the new table to be picked up - * so no new additions go into the old table while we relink. 
- */ - rcu_assign_pointer(ht->future_tbl, new_tbl); - synchronize_rcu(); + rhashtable_rehash(ht, new_tbl); - /* For each new bucket, search the corresponding old bucket for the - * first entry that hashes to the new bucket, and link the end of - * newly formed bucket chain (containing entries added to future - * table) to that entry. Since all the entries which will end up in - * the new bucket appear in the same old bucket, this constructs an - * entirely valid new hash table, but with multiple buckets - * "zipped" together into a single imprecise chain. - */ - for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { - old_hash = rht_bucket_index(old_tbl, new_hash); - lock_buckets(new_tbl, old_tbl, new_hash); - rht_for_each(he, old_tbl, old_hash) { - if (head_hashfn(ht, new_tbl, he) == new_hash) { - link_old_to_new(ht, new_tbl, new_hash, he); - break; - } - } - unlock_buckets(new_tbl, old_tbl, new_hash); - cond_resched(); - } - - /* Unzip interleaved hash chains */ - while (!complete && !ht->being_destroyed) { - /* Wait for readers. All new readers will see the new - * table, and thus no references to the old table will - * remain. - */ - synchronize_rcu(); - - /* For each bucket in the old table (each of which - * contains items from multiple buckets of the new - * table): ... - */ - complete = true; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { - lock_buckets(new_tbl, old_tbl, old_hash); - - if (hashtable_chain_unzip(ht, new_tbl, old_tbl, - old_hash)) - complete = false; - - unlock_buckets(new_tbl, old_tbl, old_hash); - cond_resched(); - } - } - - rcu_assign_pointer(ht->tbl, new_tbl); - synchronize_rcu(); - - bucket_table_free(old_tbl); return 0; } EXPORT_SYMBOL_GPL(rhashtable_expand); @@ -473,7 +349,6 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); - unsigned int new_hash; ASSERT_RHT_MUTEX(ht); @@ -483,39 +358,9 @@ int rhashtable_shrink(struct rhashtable *ht) new_tbl->hash_rnd = tbl->hash_rnd; - rcu_assign_pointer(ht->future_tbl, new_tbl); - synchronize_rcu(); - - /* Link the first entry in the old bucket to the end of the - * bucket in the new table. As entries are concurrently being - * added to the new table, lock down the new bucket. As we - * always divide the size in half when shrinking, each bucket - * in the new table maps to exactly two buckets in the old - * table. - */ - for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { - lock_buckets(new_tbl, tbl, new_hash); - - rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), - tbl->buckets[new_hash]); - ASSERT_BUCKET_LOCK(ht, tbl, new_hash + new_tbl->size); - rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), - tbl->buckets[new_hash + new_tbl->size]); - - unlock_buckets(new_tbl, tbl, new_hash); - cond_resched(); - } - - /* Publish the new, valid hash table */ - rcu_assign_pointer(ht->tbl, new_tbl); atomic_dec(&ht->shift); - /* Wait for readers. No new readers will have references to the - * old hash table. 
- */ - synchronize_rcu(); - - bucket_table_free(tbl); + rhashtable_rehash(ht, new_tbl); return 0; } @@ -545,18 +390,46 @@ unlock: mutex_unlock(&ht->mutex); } -static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, - struct bucket_table *tbl, - const struct bucket_table *old_tbl, u32 hash) +static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, + bool (*compare)(void *, void *), void *arg) { - bool no_resize_running = tbl == old_tbl; + struct bucket_table *tbl, *old_tbl; struct rhash_head *head; + bool no_resize_running; + unsigned hash; + bool success = true; + + rcu_read_lock(); + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + hash = obj_raw_hashfn(ht, old_tbl, rht_obj(ht, obj)); + + spin_lock_bh(bucket_lock(old_tbl, hash)); + + /* Because we have already taken the bucket lock in old_tbl, + * if we find that future_tbl is not yet visible then that + * guarantees all other insertions of the same entry will + * also grab the bucket lock in old_tbl because until the + * rehash completes ht->tbl won't be changed. + */ + tbl = rht_dereference_rcu(ht->future_tbl, ht); + if (tbl != old_tbl) { + hash = obj_raw_hashfn(ht, tbl, rht_obj(ht, obj)); + spin_lock(bucket_lock(tbl, hash)); + } + + if (compare && + rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, + compare, arg)) { + success = false; + goto exit; + } + + no_resize_running = tbl == old_tbl; hash = rht_bucket_index(tbl, hash); head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); - ASSERT_BUCKET_LOCK(ht, tbl, hash); - if (rht_is_a_nulls(head)) INIT_RHT_NULLS_HEAD(obj->next, ht, hash); else @@ -567,6 +440,19 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, atomic_inc(&ht->nelems); if (no_resize_running && rht_grow_above_75(ht, tbl->size)) schedule_work(&ht->run_work); + +exit: + if (tbl != old_tbl) { + hash = obj_raw_hashfn(ht, tbl, rht_obj(ht, obj)); + spin_unlock(bucket_lock(tbl, hash)); + } + + hash = obj_raw_hashfn(ht, old_tbl, rht_obj(ht, obj)); + spin_unlock_bh(bucket_lock(old_tbl, hash)); + + rcu_read_unlock(); + + return success; } /** @@ -586,22 +472,42 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, */ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) { - struct bucket_table *tbl, *old_tbl; + __rhashtable_insert(ht, obj, NULL, NULL); +} +EXPORT_SYMBOL_GPL(rhashtable_insert); + +static bool __rhashtable_remove(struct rhashtable *ht, + struct bucket_table *tbl, + struct rhash_head *obj) +{ + struct rhash_head __rcu **pprev; + struct rhash_head *he; + spinlock_t * lock; unsigned hash; + bool ret = false; - rcu_read_lock(); + hash = obj_raw_hashfn(ht, tbl, rht_obj(ht, obj)); + lock = bucket_lock(tbl, hash); + hash = rht_bucket_index(tbl, hash); - tbl = rht_dereference_rcu(ht->future_tbl, ht); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); + spin_lock_bh(lock); - lock_buckets(tbl, old_tbl, hash); - __rhashtable_insert(ht, obj, tbl, old_tbl, hash); - unlock_buckets(tbl, old_tbl, hash); + pprev = &tbl->buckets[hash]; + rht_for_each(he, tbl, hash) { + if (he != obj) { + pprev = &he->next; + continue; + } - rcu_read_unlock(); + rcu_assign_pointer(*pprev, obj->next); + ret = true; + break; + } + + spin_unlock_bh(lock); + + return ret; } -EXPORT_SYMBOL_GPL(rhashtable_insert); /** * rhashtable_remove - remove object from hash table @@ -620,68 +526,28 @@ EXPORT_SYMBOL_GPL(rhashtable_insert); */ bool rhashtable_remove(struct rhashtable *ht, 
struct rhash_head *obj) { - struct bucket_table *tbl, *new_tbl, *old_tbl; - struct rhash_head __rcu **pprev; - struct rhash_head *he, *he2; - unsigned int hash, new_hash; - bool ret = false; + struct bucket_table *tbl, *old_tbl; + bool ret; rcu_read_lock(); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - tbl = new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); - - lock_buckets(new_tbl, old_tbl, new_hash); -restart: - hash = rht_bucket_index(tbl, new_hash); - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { - if (he != obj) { - pprev = &he->next; - continue; - } - ASSERT_BUCKET_LOCK(ht, tbl, hash); - - if (old_tbl->size > new_tbl->size && tbl == old_tbl && - !rht_is_a_nulls(obj->next) && - head_hashfn(ht, tbl, obj->next) != hash) { - rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash)); - } else if (unlikely(old_tbl->size < new_tbl->size && tbl == new_tbl)) { - rht_for_each_continue(he2, obj->next, tbl, hash) { - if (head_hashfn(ht, tbl, he2) == hash) { - rcu_assign_pointer(*pprev, he2); - goto found; - } - } - - rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash)); - } else { - rcu_assign_pointer(*pprev, obj->next); - } - -found: - ret = true; - break; - } + old_tbl = rht_dereference_rcu(ht->tbl, ht); + ret = __rhashtable_remove(ht, old_tbl, obj); - /* The entry may be linked in either 'tbl', 'future_tbl', or both. - * 'future_tbl' only exists for a short period of time during - * resizing. Thus traversing both is fine and the added cost is - * very rare. + /* Because we have already taken (and released) the bucket + * lock in old_tbl, if we find that future_tbl is not yet + * visible then that guarantees the entry to still be in + * old_tbl if it exists. 
*/ - if (tbl != old_tbl) { - tbl = old_tbl; - goto restart; - } - - unlock_buckets(new_tbl, old_tbl, new_hash); + tbl = rht_dereference_rcu(ht->future_tbl, ht); + if (!ret && old_tbl != tbl) + ret = __rhashtable_remove(ht, tbl, obj); if (ret) { - bool no_resize_running = new_tbl == old_tbl; + bool no_resize_running = tbl == old_tbl; atomic_dec(&ht->nelems); - if (no_resize_running && rht_shrink_below_30(ht, new_tbl->size)) + if (no_resize_running && rht_shrink_below_30(ht, tbl->size)) schedule_work(&ht->run_work); } @@ -753,9 +619,8 @@ void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, rcu_read_lock(); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - tbl = rht_dereference_rcu(ht->future_tbl, ht); - hash = key_hashfn(ht, key, ht->p.key_len); + tbl = rht_dereference_rcu(ht->tbl, ht); + hash = key_hashfn(ht, tbl, key, ht->p.key_len); restart: rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { if (!compare(rht_obj(ht, he), arg)) @@ -764,10 +629,10 @@ restart: return rht_obj(ht, he); } - if (unlikely(tbl != old_tbl)) { - tbl = old_tbl; + old_tbl = tbl; + tbl = rht_dereference_rcu(ht->future_tbl, ht); + if (unlikely(tbl != old_tbl)) goto restart; - } rcu_read_unlock(); return NULL; @@ -833,32 +698,9 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht, bool (*compare)(void *, void *), void *arg) { - struct bucket_table *new_tbl, *old_tbl; - u32 new_hash; - bool success = true; - BUG_ON(!ht->p.key_len); - rcu_read_lock(); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); - - lock_buckets(new_tbl, old_tbl, new_hash); - - if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, - compare, arg)) { - success = false; - goto exit; - } - - __rhashtable_insert(ht, obj, new_tbl, old_tbl, new_hash); - -exit: - unlock_buckets(new_tbl, old_tbl, new_hash); - rcu_read_unlock(); - - return success; + return __rhashtable_insert(ht, obj, compare, arg); } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); -- cgit v1.2.3 From 84ed82b74dcb23d96cee2987612a677ffd2b5470 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 12 Mar 2015 14:47:13 +1100 Subject: rhashtable: Add annotation to nested lock Commit aa34a6cb0478842452bac58edb50d3ef9e178c92 ("rhashtable: Add arbitrary rehash function") killed the annotation on the nested lock which leads to bitching from lockdep. Reported-by: Fengguang Wu Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index b1c19c5fb326..d7f3db57b5d0 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -234,7 +234,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) new_bucket_lock = bucket_lock(new_tbl, new_hash); - spin_lock(new_bucket_lock); + spin_lock_nested(new_bucket_lock, RHT_LOCK_NESTED); head = rht_dereference_bucket(new_tbl->buckets[new_hash], new_tbl, new_hash); @@ -415,7 +415,7 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, tbl = rht_dereference_rcu(ht->future_tbl, ht); if (tbl != old_tbl) { hash = obj_raw_hashfn(ht, tbl, rht_obj(ht, obj)); - spin_lock(bucket_lock(tbl, hash)); + spin_lock_nested(bucket_lock(tbl, hash), RHT_LOCK_NESTED); } if (compare && -- cgit v1.2.3 From 8d2b18793d16e4186f00b07d031a25537c4cefb9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 12 Mar 2015 14:49:38 +1100 Subject: rhashtable: Move masking back into key_hashfn This patch reverts commit c88455ce50ae4224d84960ce2baa53e61580df27 ("rhashtable: key_hashfn() must return full hash value") because the only user of it always masks the hash value. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index d7f3db57b5d0..ff9cc3386fc9 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -83,7 +83,8 @@ static u32 obj_raw_hashfn(struct rhashtable *ht, static u32 key_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const void *key, u32 len) { - return ht->p.hashfn(key, len, tbl->hash_rnd) >> HASH_RESERVED_SPACE; + return rht_bucket_index(tbl, ht->p.hashfn(key, len, tbl->hash_rnd) >> + HASH_RESERVED_SPACE); } static u32 head_hashfn(struct rhashtable *ht, @@ -622,7 +623,7 @@ void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, tbl = rht_dereference_rcu(ht->tbl, ht); hash = key_hashfn(ht, tbl, key, ht->p.key_len); restart: - rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { + rht_for_each_rcu(he, tbl, hash) { if (!compare(rht_obj(ht, he), arg)) continue; rcu_read_unlock(); -- cgit v1.2.3 From eca849333017cab1cd02c8fc9187962fa629b27d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 12 Mar 2015 14:49:39 +1100 Subject: rhashtable: Use head_hashfn instead of obj_raw_hashfn Now that we don't have cross-table hashes, we no longer need to keep the entire hash value so all users of obj_raw_hashfn can use head_hashfn instead. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index ff9cc3386fc9..03fdaf869c4d 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -403,7 +403,7 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, rcu_read_lock(); old_tbl = rht_dereference_rcu(ht->tbl, ht); - hash = obj_raw_hashfn(ht, old_tbl, rht_obj(ht, obj)); + hash = head_hashfn(ht, old_tbl, obj); spin_lock_bh(bucket_lock(old_tbl, hash)); @@ -415,7 +415,7 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, */ tbl = rht_dereference_rcu(ht->future_tbl, ht); if (tbl != old_tbl) { - hash = obj_raw_hashfn(ht, tbl, rht_obj(ht, obj)); + hash = head_hashfn(ht, tbl, obj); spin_lock_nested(bucket_lock(tbl, hash), RHT_LOCK_NESTED); } @@ -428,7 +428,6 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, no_resize_running = tbl == old_tbl; - hash = rht_bucket_index(tbl, hash); head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); if (rht_is_a_nulls(head)) @@ -444,11 +443,11 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, exit: if (tbl != old_tbl) { - hash = obj_raw_hashfn(ht, tbl, rht_obj(ht, obj)); + hash = head_hashfn(ht, tbl, obj); spin_unlock(bucket_lock(tbl, hash)); } - hash = obj_raw_hashfn(ht, old_tbl, rht_obj(ht, obj)); + hash = head_hashfn(ht, old_tbl, obj); spin_unlock_bh(bucket_lock(old_tbl, hash)); rcu_read_unlock(); @@ -487,9 +486,8 @@ static bool __rhashtable_remove(struct rhashtable *ht, unsigned hash; bool ret = false; - hash = obj_raw_hashfn(ht, tbl, rht_obj(ht, obj)); + hash = head_hashfn(ht, tbl, obj); lock = bucket_lock(tbl, hash); - hash = rht_bucket_index(tbl, hash); spin_lock_bh(lock); -- cgit v1.2.3 From cffaa9cb922472936b269017afdd3f147cb6f380 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 12 Mar 2015 14:49:40 +1100 Subject: rhashtable: Remove key length argument to key_hashfn key_hashfn has only one caller and it doesn't really need to supply the key length as an extra parameter. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 03fdaf869c4d..838cccc4ef7e 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -81,9 +81,10 @@ static u32 obj_raw_hashfn(struct rhashtable *ht, } static u32 key_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, - const void *key, u32 len) + const void *key) { - return rht_bucket_index(tbl, ht->p.hashfn(key, len, tbl->hash_rnd) >> + return rht_bucket_index(tbl, ht->p.hashfn(key, ht->p.key_len, + tbl->hash_rnd) >> HASH_RESERVED_SPACE); } @@ -619,7 +620,7 @@ void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); - hash = key_hashfn(ht, tbl, key, ht->p.key_len); + hash = key_hashfn(ht, tbl, key); restart: rht_for_each_rcu(he, tbl, hash) { if (!compare(rht_obj(ht, he), arg)) -- cgit v1.2.3 From ec9f71c59e00388efc1337307511b59cc4c48394 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 12 Mar 2015 14:49:41 +1100 Subject: rhashtable: Remove obj_raw_hashfn Now that the only caller of obj_raw_hashfn is head_hashfn, we can simply kill it and fold it into the latter. This patch also moves the common shift from head_hashfn/key_hashfn into rht_bucket_index. 
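For readers following the helper consolidation across these three patches, here is a compact userspace mock of the layout they arrive at (assumed simplified types, a toy hash, and an illustrative HASH_RESERVED_SPACE value — not the kernel structs): the reserved-bits shift happens once, inside the bucket-index helper, key_hashfn() masks a key hash, and head_hashfn() merely picks the hash source for an object.

/* Userspace mock of the post-patch helper layout; names mirror the
 * kernel helpers but everything else is simplified.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define HASH_RESERVED_SPACE 5   /* illustrative value */

struct mock_table {
        uint32_t hash_rnd;
        unsigned int size;      /* power of two */
};

struct mock_params {
        size_t key_len;         /* 0 means "use obj_hashfn" */
        size_t key_offset;
        uint32_t (*hashfn)(const void *key, size_t len, uint32_t seed);
        uint32_t (*obj_hashfn)(const void *obj, uint32_t seed);
};

static uint32_t bucket_index(const struct mock_table *tbl, uint32_t hash)
{
        /* the common shift now lives here, applied exactly once */
        return (hash >> HASH_RESERVED_SPACE) & (tbl->size - 1);
}

static uint32_t key_hashfn(const struct mock_params *p,
                           const struct mock_table *tbl, const void *key)
{
        return bucket_index(tbl, p->hashfn(key, p->key_len, tbl->hash_rnd));
}

static uint32_t head_hashfn(const struct mock_params *p,
                            const struct mock_table *tbl, const void *obj)
{
        const char *ptr = obj;

        return p->key_len ?
               key_hashfn(p, tbl, ptr + p->key_offset) :
               bucket_index(tbl, p->obj_hashfn(ptr, tbl->hash_rnd));
}

static uint32_t toy_hash(const void *key, size_t len, uint32_t seed)
{
        const unsigned char *d = key;
        uint32_t h = seed;
        size_t i;

        for (i = 0; i < len; i++)
                h = h * 31 + d[i];
        return h;
}

int main(void)
{
        struct mock_table tbl = { .hash_rnd = 0x1234, .size = 16 };
        struct mock_params p = { .key_len = sizeof(uint32_t), .hashfn = toy_hash };
        uint32_t obj = 42;      /* the key is the object itself here */

        printf("bucket = %u\n", head_hashfn(&p, &tbl, &obj));
        return 0;
}
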
Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 838cccc4ef7e..6ffc793145f3 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -63,36 +63,25 @@ static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) { - return hash & (tbl->size - 1); -} - -static u32 obj_raw_hashfn(struct rhashtable *ht, - const struct bucket_table *tbl, const void *ptr) -{ - u32 hash; - - if (unlikely(!ht->p.key_len)) - hash = ht->p.obj_hashfn(ptr, tbl->hash_rnd); - else - hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, - tbl->hash_rnd); - - return hash >> HASH_RESERVED_SPACE; + return (hash >> HASH_RESERVED_SPACE) & (tbl->size - 1); } static u32 key_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const void *key) { return rht_bucket_index(tbl, ht->p.hashfn(key, ht->p.key_len, - tbl->hash_rnd) >> - HASH_RESERVED_SPACE); + tbl->hash_rnd)); } static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { - return rht_bucket_index(tbl, obj_raw_hashfn(ht, tbl, rht_obj(ht, he))); + const char *ptr = rht_obj(ht, he); + + return likely(ht->p.key_len) ? + key_hashfn(ht, tbl, ptr + ht->p.key_offset) : + rht_bucket_index(tbl, ht->p.obj_hashfn(ptr, tbl->hash_rnd)); } #ifdef CONFIG_PROVE_LOCKING -- cgit v1.2.3 From 9497df88ab5567daa001829051c5f87161a81ff0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 12 Mar 2015 22:07:49 +1100 Subject: rhashtable: Fix reader/rehash race There is a potential race condition between readers and the rehasher. In particular, the rehasher could have started a rehash while the reader finishes a scan of the old table but fails to see the new table pointer. This patch closes this window by adding smp_wmb/smp_rmb. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 6ffc793145f3..68210cc2bab8 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -271,6 +271,9 @@ static void rhashtable_rehash(struct rhashtable *ht, */ rcu_assign_pointer(ht->future_tbl, new_tbl); + /* Ensure the new table is visible to readers. */ + smp_wmb(); + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) rhashtable_rehash_chain(ht, old_hash); @@ -618,6 +621,9 @@ restart: return rht_obj(ht, he); } + /* Ensure we see any new tables. */ + smp_rmb(); + old_tbl = tbl; tbl = rht_dereference_rcu(ht->future_tbl, ht); if (unlikely(tbl != old_tbl)) -- cgit v1.2.3 From a5b6846f9e1a080493210013385c28faecee36f0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Mar 2015 15:28:40 +0100 Subject: rhashtable: kill ht->shift atomic operations Commit c0c09bfdc415 ("rhashtable: avoid unnecessary wakeup for worker queue") changed ht->shift to be atomic, which is actually unnecessary. Instead of leaving the current shift in the core rhashtable structure, it can be cached inside the individual bucket tables. There, it will only be initialized once during a new table allocation in the shrink/expansion slow path, and from then onward it stays immutable for the rest of the bucket table liftime. That allows shift to be non-atomic. The patch also moves hash_rnd management into the table setup. 
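A quick userspace sketch of what the load-factor checks look like once size and shift live in the bucket table (names, limits and the main() values below are illustrative, not the kernel API): both decisions read only the current table plus the element count, so no atomic shift counter is needed on the rhashtable itself.

/* Illustrative only: the 75% grow / 30% shrink thresholds driven by
 * per-table size and shift.
 */
#include <stdbool.h>
#include <stdio.h>

struct mock_tbl {
        unsigned int size;      /* number of buckets, power of two */
        unsigned int shift;     /* ilog2(size), cached at allocation */
};

struct mock_ht {
        unsigned int nelems;
        unsigned int min_shift;
        unsigned int max_shift; /* 0 means "no upper bound" */
};

static bool grow_above_75(const struct mock_ht *ht, const struct mock_tbl *tbl)
{
        return ht->nelems > tbl->size / 4 * 3 &&
               (!ht->max_shift || tbl->shift < ht->max_shift);
}

static bool shrink_below_30(const struct mock_ht *ht, const struct mock_tbl *tbl)
{
        return ht->nelems < tbl->size * 3 / 10 &&
               tbl->shift > ht->min_shift;
}

int main(void)
{
        struct mock_tbl tbl = { .size = 64, .shift = 6 };
        struct mock_ht ht = { .nelems = 50, .min_shift = 4, .max_shift = 0 };

        printf("grow?   %d\n", grow_above_75(&ht, &tbl));   /* 50 > 48  -> 1 */
        printf("shrink? %d\n", shrink_below_30(&ht, &tbl)); /* 50 < 19? -> 0 */
        return 0;
}
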
The rhashtable structure now consumes 3 instead of 4 cachelines. Signed-off-by: Daniel Borkmann Cc: Ying Xue Acked-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 55 +++++++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 30 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 68210cc2bab8..adea791ea3ab 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -147,7 +147,7 @@ static void bucket_table_free(const struct bucket_table *tbl) } static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - size_t nbuckets) + size_t nbuckets, u32 hash_rnd) { struct bucket_table *tbl = NULL; size_t size; @@ -162,6 +162,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, return NULL; tbl->size = nbuckets; + tbl->shift = ilog2(nbuckets); + tbl->hash_rnd = hash_rnd; if (alloc_bucket_locks(ht, tbl) < 0) { bucket_table_free(tbl); @@ -177,25 +179,27 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, /** * rht_grow_above_75 - returns true if nelems > 0.75 * table-size * @ht: hash table - * @new_size: new table size + * @tbl: current table */ -static bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) +static bool rht_grow_above_75(const struct rhashtable *ht, + const struct bucket_table *tbl) { /* Expand table when exceeding 75% load */ - return atomic_read(&ht->nelems) > (new_size / 4 * 3) && - (!ht->p.max_shift || atomic_read(&ht->shift) < ht->p.max_shift); + return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && + (!ht->p.max_shift || tbl->shift < ht->p.max_shift); } /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table - * @new_size: new table size + * @tbl: current table */ -static bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) +static bool rht_shrink_below_30(const struct rhashtable *ht, + const struct bucket_table *tbl) { /* Shrink table beneath 30% load */ - return atomic_read(&ht->nelems) < (new_size * 3 / 10) && - (atomic_read(&ht->shift) > ht->p.min_shift); + return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && + tbl->shift > ht->p.min_shift; } static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) @@ -310,16 +314,11 @@ int rhashtable_expand(struct rhashtable *ht) ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, old_tbl->hash_rnd); if (new_tbl == NULL) return -ENOMEM; - new_tbl->hash_rnd = old_tbl->hash_rnd; - - atomic_inc(&ht->shift); - rhashtable_rehash(ht, new_tbl); - return 0; } EXPORT_SYMBOL_GPL(rhashtable_expand); @@ -342,20 +341,15 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); */ int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, tbl->size / 2); + new_tbl = bucket_table_alloc(ht, old_tbl->size / 2, old_tbl->hash_rnd); if (new_tbl == NULL) return -ENOMEM; - new_tbl->hash_rnd = tbl->hash_rnd; - - atomic_dec(&ht->shift); - rhashtable_rehash(ht, new_tbl); - return 0; } EXPORT_SYMBOL_GPL(rhashtable_shrink); @@ -376,9 +370,9 @@ static void rht_deferred_worker(struct work_struct *work) list_for_each_entry(walker, &ht->walkers, list) walker->resize = true; - if (rht_grow_above_75(ht, tbl->size)) + if (rht_grow_above_75(ht, tbl)) rhashtable_expand(ht); - else if 
(rht_shrink_below_30(ht, tbl->size)) + else if (rht_shrink_below_30(ht, tbl)) rhashtable_shrink(ht); unlock: mutex_unlock(&ht->mutex); @@ -431,7 +425,7 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); - if (no_resize_running && rht_grow_above_75(ht, tbl->size)) + if (no_resize_running && rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); exit: @@ -539,7 +533,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) bool no_resize_running = tbl == old_tbl; atomic_dec(&ht->nelems); - if (no_resize_running && rht_shrink_below_30(ht, tbl->size)) + if (no_resize_running && rht_shrink_below_30(ht, tbl)) schedule_work(&ht->run_work); } @@ -913,6 +907,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; + u32 hash_rnd; size = HASH_DEFAULT_SIZE; @@ -939,14 +934,14 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) else ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; - tbl = bucket_table_alloc(ht, size); + get_random_bytes(&hash_rnd, sizeof(hash_rnd)); + + tbl = bucket_table_alloc(ht, size, hash_rnd); if (tbl == NULL) return -ENOMEM; - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - atomic_set(&ht->nelems, 0); - atomic_set(&ht->shift, ilog2(tbl->size)); + RCU_INIT_POINTER(ht->tbl, tbl); RCU_INIT_POINTER(ht->future_tbl, tbl); -- cgit v1.2.3 From 393619474ec0ba2a16dee12ec78fd43164f1e9b7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 13 Mar 2015 12:54:10 +1100 Subject: rhashtable: Fix read-side crash during rehash This patch fixes a typo rhashtable_lookup_compare where we fail to recompute the hash when looking up the new table. This causes elements to be missed and potentially a crash during a resize. Reported-by: Thomas Graf Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index adea791ea3ab..fc0d451279f0 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -606,8 +606,8 @@ void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); - hash = key_hashfn(ht, tbl, key); restart: + hash = key_hashfn(ht, tbl, key); rht_for_each_rcu(he, tbl, hash) { if (!compare(rht_obj(ht, he), arg)) continue; -- cgit v1.2.3 From eddee5ba34eb6c9890ef106f19ead2b370e5342f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 14 Mar 2015 13:57:20 +1100 Subject: rhashtable: Fix walker behaviour during rehash Previously whenever the walker encountered a resize it simply snaps back to the beginning and starts again. However, this only works if the rehash started and completed while the walker was idle. If the walker attempts to restart while the rehash is still ongoing, we may miss objects that we shouldn't have. This patch fixes this by making the walker walk the old table followed by the new table just like all other readers. If a rehash is detected we will still signal our caller of the fact so they can prepare for duplicates but we will simply continue the walk onto the new table after the old one is finished either by us or by the rehasher. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 69 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 23 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index fc0d451279f0..f7c76079f8f1 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -170,6 +170,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, return NULL; } + INIT_LIST_HEAD(&tbl->walkers); + for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); @@ -264,6 +266,7 @@ static void rhashtable_rehash(struct rhashtable *ht, struct bucket_table *new_tbl) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct rhashtable_walker *walker; unsigned old_hash; get_random_bytes(&new_tbl->hash_rnd, sizeof(new_tbl->hash_rnd)); @@ -284,6 +287,9 @@ static void rhashtable_rehash(struct rhashtable *ht, /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); + list_for_each_entry(walker, &old_tbl->walkers, list) + walker->tbl = NULL; + /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. @@ -358,7 +364,6 @@ static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; - struct rhashtable_walker *walker; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); @@ -367,9 +372,6 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rht_dereference(ht->tbl, ht); - list_for_each_entry(walker, &ht->walkers, list) - walker->resize = true; - if (rht_grow_above_75(ht, tbl)) rhashtable_expand(ht); else if (rht_shrink_below_30(ht, tbl)) @@ -725,11 +727,9 @@ int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) if (!iter->walker) return -ENOMEM; - INIT_LIST_HEAD(&iter->walker->list); - iter->walker->resize = false; - mutex_lock(&ht->mutex); - list_add(&iter->walker->list, &ht->walkers); + iter->walker->tbl = rht_dereference(ht->tbl, ht); + list_add(&iter->walker->list, &iter->walker->tbl->walkers); mutex_unlock(&ht->mutex); return 0; @@ -745,7 +745,8 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); void rhashtable_walk_exit(struct rhashtable_iter *iter) { mutex_lock(&iter->ht->mutex); - list_del(&iter->walker->list); + if (iter->walker->tbl) + list_del(&iter->walker->list); mutex_unlock(&iter->ht->mutex); kfree(iter->walker); } @@ -767,12 +768,19 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit); */ int rhashtable_walk_start(struct rhashtable_iter *iter) { + struct rhashtable *ht = iter->ht; + + mutex_lock(&ht->mutex); + + if (iter->walker->tbl) + list_del(&iter->walker->list); + rcu_read_lock(); - if (iter->walker->resize) { - iter->slot = 0; - iter->skip = 0; - iter->walker->resize = false; + mutex_unlock(&ht->mutex); + + if (!iter->walker->tbl) { + iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); return -EAGAIN; } @@ -794,13 +802,11 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { - const struct bucket_table *tbl; + struct bucket_table *tbl = iter->walker->tbl; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; void *obj = NULL; - tbl = rht_dereference_rcu(ht->tbl, ht); - if (p) { p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); goto next; @@ -826,17 +832,18 @@ next: iter->skip = 0; } - iter->p = NULL; - -out: - if (iter->walker->resize) { - iter->p = NULL; + iter->walker->tbl = rht_dereference_rcu(ht->future_tbl, ht); + if (iter->walker->tbl != tbl) { iter->slot = 0; 
iter->skip = 0; - iter->walker->resize = false; return ERR_PTR(-EAGAIN); } + iter->walker->tbl = NULL; + iter->p = NULL; + +out: + return obj; } EXPORT_SYMBOL_GPL(rhashtable_walk_next); @@ -849,7 +856,24 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_next); */ void rhashtable_walk_stop(struct rhashtable_iter *iter) { + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker->tbl; + rcu_read_unlock(); + + if (!tbl) + return; + + ht = iter->ht; + + mutex_lock(&ht->mutex); + if (rht_dereference(ht->tbl, ht) == tbl || + rht_dereference(ht->future_tbl, ht) == tbl) + list_add(&iter->walker->list, &tbl->walkers); + else + iter->walker->tbl = NULL; + mutex_unlock(&ht->mutex); + iter->p = NULL; } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); @@ -927,7 +951,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); memcpy(&ht->p, params, sizeof(*params)); - INIT_LIST_HEAD(&ht->walkers); if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); -- cgit v1.2.3 From 8f2484bdb55daa53ecaddb5fa4c298e3d262b69e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 14 Mar 2015 13:57:21 +1100 Subject: rhashtable: Use SINGLE_DEPTH_NESTING We only nest one level deep there is no need to roll our own subclasses. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index f7c76079f8f1..5d06cc2b1e4a 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -33,11 +33,6 @@ /* Base bits plus 1 bit for nulls marker */ #define HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) -enum { - RHT_LOCK_NORMAL, - RHT_LOCK_NESTED, -}; - /* The bucket lock is selected based on the hash and protects mutations * on a group of hash buckets. * @@ -231,7 +226,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) new_bucket_lock = bucket_lock(new_tbl, new_hash); - spin_lock_nested(new_bucket_lock, RHT_LOCK_NESTED); + spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); head = rht_dereference_bucket(new_tbl->buckets[new_hash], new_tbl, new_hash); @@ -405,7 +400,7 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, tbl = rht_dereference_rcu(ht->future_tbl, ht); if (tbl != old_tbl) { hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(bucket_lock(tbl, hash), RHT_LOCK_NESTED); + spin_lock_nested(bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); } if (compare && -- cgit v1.2.3 From 5269b53da4d432b0fbf755bd423c807bf6bd4aa0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 14 Mar 2015 13:57:22 +1100 Subject: rhashtable: Move seed init into bucket_table_alloc It seems that I have already made every rehash redo the random seed even though my commit message indicated otherwise :) Since we have already taken that step, this patch goes one step further and moves the seed initialisation into bucket_table_alloc. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 5d06cc2b1e4a..e55bbc84c449 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -142,7 +142,7 @@ static void bucket_table_free(const struct bucket_table *tbl) } static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - size_t nbuckets, u32 hash_rnd) + size_t nbuckets) { struct bucket_table *tbl = NULL; size_t size; @@ -158,7 +158,6 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, tbl->size = nbuckets; tbl->shift = ilog2(nbuckets); - tbl->hash_rnd = hash_rnd; if (alloc_bucket_locks(ht, tbl) < 0) { bucket_table_free(tbl); @@ -167,6 +166,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, INIT_LIST_HEAD(&tbl->walkers); + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); @@ -264,8 +265,6 @@ static void rhashtable_rehash(struct rhashtable *ht, struct rhashtable_walker *walker; unsigned old_hash; - get_random_bytes(&new_tbl->hash_rnd, sizeof(new_tbl->hash_rnd)); - /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. * The synchronize_rcu() guarantees for the new table to be picked up @@ -315,7 +314,7 @@ int rhashtable_expand(struct rhashtable *ht) ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, old_tbl->hash_rnd); + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); if (new_tbl == NULL) return -ENOMEM; @@ -346,7 +345,7 @@ int rhashtable_shrink(struct rhashtable *ht) ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size / 2, old_tbl->hash_rnd); + new_tbl = bucket_table_alloc(ht, old_tbl->size / 2); if (new_tbl == NULL) return -ENOMEM; @@ -926,7 +925,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; - u32 hash_rnd; size = HASH_DEFAULT_SIZE; @@ -952,9 +950,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) else ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; - get_random_bytes(&hash_rnd, sizeof(hash_rnd)); - - tbl = bucket_table_alloc(ht, size, hash_rnd); + tbl = bucket_table_alloc(ht, size); if (tbl == NULL) return -ENOMEM; -- cgit v1.2.3 From 9d901bc05153bbf33b5da2cd6266865e531f0545 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 14 Mar 2015 13:57:23 +1100 Subject: rhashtable: Free bucket tables asynchronously after rehash There is in fact no need to wait for an RCU grace period in the rehash function, since all insertions are guaranteed to go into the new table through spin locks. This patch uses call_rcu to free the old/rehashed table at our leisure. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e55bbc84c449..36fb0910bec2 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -141,6 +141,11 @@ static void bucket_table_free(const struct bucket_table *tbl) kvfree(tbl); } +static void bucket_table_free_rcu(struct rcu_head *head) +{ + bucket_table_free(container_of(head, struct bucket_table, rcu)); +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets) { @@ -288,9 +293,7 @@ static void rhashtable_rehash(struct rhashtable *ht, * table, and thus no references to the old table will * remain. */ - synchronize_rcu(); - - bucket_table_free(old_tbl); + call_rcu(&old_tbl->rcu, bucket_table_free_rcu); } /** -- cgit v1.2.3 From 63d512d0cffcae40505d9448abd509972465e846 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 14 Mar 2015 13:57:24 +1100 Subject: rhashtable: Add rehash counter to bucket_table This patch adds a rehash counter to bucket_table to indicate the last bucket that has been rehashed. This serves two purposes: 1. Any bucket that has been rehashed can never gain a new object. 2. If the rehash counter reaches the size of the table, the table will forever remain empty. This patch also downsizes bucket_table->size to an unsigned int since we do not support sizes greater than 32 bits yet. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 36fb0910bec2..ff4ea1704546 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -260,6 +260,7 @@ static void rhashtable_rehash_chain(struct rhashtable *ht, unsigned old_hash) spin_lock_bh(old_bucket_lock); while (!rhashtable_rehash_one(ht, old_hash)) ; + old_tbl->rehash++; spin_unlock_bh(old_bucket_lock); } -- cgit v1.2.3 From c4db8848af6af92f90462258603be844baeab44d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 14 Mar 2015 13:57:25 +1100 Subject: rhashtable: Move future_tbl into struct bucket_table This patch moves future_tbl to open up the possibility of having multiple rehashes on the same table. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index ff4ea1704546..9d53a46dcca9 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -207,8 +207,9 @@ static bool rht_shrink_below_30(const struct rhashtable *ht, static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) { - struct bucket_table *new_tbl = rht_dereference(ht->future_tbl, ht); struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl = + rht_dereference(old_tbl->future_tbl, ht) ?: old_tbl; struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; int err = -ENOENT; struct rhash_head *head, *next, *entry; @@ -273,10 +274,8 @@ static void rhashtable_rehash(struct rhashtable *ht, /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. - * The synchronize_rcu() guarantees for the new table to be picked up - * so no new additions go into the old table while we relink. 
*/ - rcu_assign_pointer(ht->future_tbl, new_tbl); + rcu_assign_pointer(old_tbl->future_tbl, new_tbl); /* Ensure the new table is visible to readers. */ smp_wmb(); @@ -400,7 +399,7 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, * also grab the bucket lock in old_tbl because until the * rehash completes ht->tbl won't be changed. */ - tbl = rht_dereference_rcu(ht->future_tbl, ht); + tbl = rht_dereference_rcu(old_tbl->future_tbl, ht) ?: old_tbl; if (tbl != old_tbl) { hash = head_hashfn(ht, tbl, obj); spin_lock_nested(bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); @@ -525,7 +524,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) * visible then that guarantees the entry to still be in * old_tbl if it exists. */ - tbl = rht_dereference_rcu(ht->future_tbl, ht); + tbl = rht_dereference_rcu(old_tbl->future_tbl, ht) ?: old_tbl; if (!ret && old_tbl != tbl) ret = __rhashtable_remove(ht, tbl, obj); @@ -599,7 +598,7 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg) { - const struct bucket_table *tbl, *old_tbl; + const struct bucket_table *tbl; struct rhash_head *he; u32 hash; @@ -618,9 +617,8 @@ restart: /* Ensure we see any new tables. */ smp_rmb(); - old_tbl = tbl; - tbl = rht_dereference_rcu(ht->future_tbl, ht); - if (unlikely(tbl != old_tbl)) + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) goto restart; rcu_read_unlock(); @@ -830,14 +828,13 @@ next: iter->skip = 0; } - iter->walker->tbl = rht_dereference_rcu(ht->future_tbl, ht); - if (iter->walker->tbl != tbl) { + iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker->tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); } - iter->walker->tbl = NULL; iter->p = NULL; out: @@ -865,8 +862,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) ht = iter->ht; mutex_lock(&ht->mutex); - if (rht_dereference(ht->tbl, ht) == tbl || - rht_dereference(ht->future_tbl, ht) == tbl) + if (tbl->rehash < tbl->size) list_add(&iter->walker->list, &tbl->walkers); else iter->walker->tbl = NULL; @@ -961,7 +957,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) atomic_set(&ht->nelems, 0); RCU_INIT_POINTER(ht->tbl, tbl); - RCU_INIT_POINTER(ht->future_tbl, tbl); INIT_WORK(&ht->run_work, rht_deferred_worker); -- cgit v1.2.3 From 963ecbd41a1026d99ec7537c050867428c397b89 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 15 Mar 2015 21:12:04 +1100 Subject: rhashtable: Fix use-after-free in rhashtable_walk_stop The commit c4db8848af6af92f90462258603be844baeab44d ("rhashtable: Move future_tbl into struct bucket_table") introduced a use-after- free bug in rhashtable_walk_stop because it dereferences tbl after droping the RCU read lock. This patch fixes it by moving the RCU read unlock down to the bottom of rhashtable_walk_stop. In fact this was how I had it originally but it got dropped while rearranging patches because this one depended on the async freeing of bucket_table. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 9d53a46dcca9..b916679b3e3b 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -854,10 +854,8 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) struct rhashtable *ht; struct bucket_table *tbl = iter->walker->tbl; - rcu_read_unlock(); - if (!tbl) - return; + goto out; ht = iter->ht; @@ -869,6 +867,9 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) mutex_unlock(&ht->mutex); iter->p = NULL; + +out: + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); -- cgit v1.2.3 From 565e86404e4c40e03f602ef0d6d490328f28c493 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 15 Mar 2015 21:12:05 +1100 Subject: rhashtable: Fix rhashtable_remove failures The commit 9d901bc05153bbf33b5da2cd6266865e531f0545 ("rhashtable: Free bucket tables asynchronously after rehash") causes gratuitous failures in rhashtable_remove. The reason is that it inadvertently introduced multiple rehashing from the perspective of readers. IOW it is now possible to see more than two tables during a single RCU critical section. Fortunately the other reader rhashtable_lookup already deals with this correctly thanks to c4db8848af6af92f90462258603be844baeab44d ("rhashtable: rhashtable: Move future_tbl into struct bucket_table") so only rhashtable_remove is broken by this change. This patch fixes this by looping over every table from the first one to the last or until we find the element that we were trying to delete. Incidentally the simple test for detecting rehashing to prevent starting another shrinking no longer works. Since it isn't needed anyway (the work queue and the mutex serves as a natural barrier to unnecessary rehashes) I've simply killed the test. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index b916679b3e3b..c523d3a563aa 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -511,28 +511,25 @@ static bool __rhashtable_remove(struct rhashtable *ht, */ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) { - struct bucket_table *tbl, *old_tbl; + struct bucket_table *tbl; bool ret; rcu_read_lock(); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - ret = __rhashtable_remove(ht, old_tbl, obj); + tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in - * old_tbl if it exists. + * the old tbl if it exists. 
*/ - tbl = rht_dereference_rcu(old_tbl->future_tbl, ht) ?: old_tbl; - if (!ret && old_tbl != tbl) - ret = __rhashtable_remove(ht, tbl, obj); + while (!(ret = __rhashtable_remove(ht, tbl, obj)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; if (ret) { - bool no_resize_running = tbl == old_tbl; - atomic_dec(&ht->nelems); - if (no_resize_running && rht_shrink_below_30(ht, tbl)) + if (rht_shrink_below_30(ht, tbl)) schedule_work(&ht->run_work); } -- cgit v1.2.3 From db4374f48a6c31c02f6ad1d19b257c186b443c0c Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Mon, 16 Mar 2015 10:42:27 +0100 Subject: rhashtable: Annotate RCU locking of walkers Fixes the following sparse warnings: lib/rhashtable.c:767:5: warning: context imbalance in 'rhashtable_walk_start' - wrong count at exit lib/rhashtable.c:849:6: warning: context imbalance in 'rhashtable_walk_stop' - unexpected unlock Fixes: f2dba9c6ff0d ("rhashtable: Introduce rhashtable_walk_*") Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index c523d3a563aa..eae26a67bd18 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -760,6 +760,7 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit); * by calling rhashtable_walk_next. */ int rhashtable_walk_start(struct rhashtable_iter *iter) + __acquires(RCU) { struct rhashtable *ht = iter->ht; @@ -847,6 +848,7 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_next); * Finish a hash table walk. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) { struct rhashtable *ht; struct bucket_table *tbl = iter->walker->tbl; -- cgit v1.2.3 From 617011e7d5559046e4fc8f87793c8a5d9c3431b0 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Mon, 16 Mar 2015 10:42:26 +0100 Subject: rhashtable: Avoid calculating hash again to unlock Caching the lock pointer avoids having to hash on the object again to unlock the bucket locks. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index eae26a67bd18..09a7ada89ade 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -384,14 +384,16 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, struct rhash_head *head; bool no_resize_running; unsigned hash; + spinlock_t *old_lock; bool success = true; rcu_read_lock(); old_tbl = rht_dereference_rcu(ht->tbl, ht); hash = head_hashfn(ht, old_tbl, obj); + old_lock = bucket_lock(old_tbl, hash); - spin_lock_bh(bucket_lock(old_tbl, hash)); + spin_lock_bh(old_lock); /* Because we have already taken the bucket lock in old_tbl, * if we find that future_tbl is not yet visible then that @@ -428,13 +430,10 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, schedule_work(&ht->run_work); exit: - if (tbl != old_tbl) { - hash = head_hashfn(ht, tbl, obj); + if (tbl != old_tbl) spin_unlock(bucket_lock(tbl, hash)); - } - hash = head_hashfn(ht, old_tbl, obj); - spin_unlock_bh(bucket_lock(old_tbl, hash)); + spin_unlock_bh(old_lock); rcu_read_unlock(); -- cgit v1.2.3 From 6aebd940840a4d3a0a8ffc5883d3892f4bd61e90 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 18 Mar 2015 20:01:15 +1100 Subject: rhashtable: Remove shift from bucket_table Keeping both size and shift is silly. We only need one. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 09a7ada89ade..097400362467 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -162,7 +162,6 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, return NULL; tbl->size = nbuckets; - tbl->shift = ilog2(nbuckets); if (alloc_bucket_locks(ht, tbl) < 0) { bucket_table_free(tbl); @@ -189,7 +188,7 @@ static bool rht_grow_above_75(const struct rhashtable *ht, { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && - (!ht->p.max_shift || tbl->shift < ht->p.max_shift); + (!ht->p.max_shift || tbl->size < (1 << ht->p.max_shift)); } /** @@ -202,7 +201,7 @@ static bool rht_shrink_below_30(const struct rhashtable *ht, { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && - tbl->shift > ht->p.min_shift; + tbl->size > (1 << ht->p.min_shift); } static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) -- cgit v1.2.3 From c2e213cff701fce71a0aba8de82f2c2a4acf52ae Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 18 Mar 2015 20:01:16 +1100 Subject: rhashtable: Introduce max_size/min_size This patch adds the parameters max_size and min_size which are meant to replace max_shift and min_shift. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 097400362467..c4061bbd0113 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -27,7 +27,7 @@ #include #define HASH_DEFAULT_SIZE 64UL -#define HASH_MIN_SIZE 4UL +#define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 128UL /* Base bits plus 1 bit for nulls marker */ @@ -188,7 +188,8 @@ static bool rht_grow_above_75(const struct rhashtable *ht, { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && - (!ht->p.max_shift || tbl->size < (1 << ht->p.max_shift)); + (!ht->p.max_shift || tbl->size < (1 << ht->p.max_shift)) && + (!ht->p.max_size || tbl->size < ht->p.max_size); } /** @@ -201,7 +202,8 @@ static bool rht_shrink_below_30(const struct rhashtable *ht, { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && - tbl->size > (1 << ht->p.min_shift); + tbl->size > (1 << ht->p.min_shift) && + tbl->size > ht->p.min_size; } static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) @@ -873,7 +875,8 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), - 1UL << params->min_shift); + max(1UL << params->min_shift, + (unsigned long)params->min_size)); } /** @@ -935,6 +938,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) params->min_shift = max_t(size_t, params->min_shift, ilog2(HASH_MIN_SIZE)); + params->min_size = max(params->min_size, HASH_MIN_SIZE); if (params->nelem_hint) size = rounded_hashtable_size(params); -- cgit v1.2.3 From e2e21c1c5808e5dfd88d3606cd6386cf85f6f5b1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 18 Mar 2015 20:01:21 +1100 Subject: rhashtable: Remove max_shift and min_shift Now that nobody uses max_shift and min_shift, we can safely remove them. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index c4061bbd0113..5f8fe3e88219 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -188,7 +188,6 @@ static bool rht_grow_above_75(const struct rhashtable *ht, { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && - (!ht->p.max_shift || tbl->size < (1 << ht->p.max_shift)) && (!ht->p.max_size || tbl->size < ht->p.max_size); } @@ -202,7 +201,6 @@ static bool rht_shrink_below_30(const struct rhashtable *ht, { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && - tbl->size > (1 << ht->p.min_shift) && tbl->size > ht->p.min_size; } @@ -875,8 +873,7 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), - max(1UL << params->min_shift, - (unsigned long)params->min_size)); + (unsigned long)params->min_size); } /** @@ -936,8 +933,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) return -EINVAL; - params->min_shift = max_t(size_t, params->min_shift, - ilog2(HASH_MIN_SIZE)); params->min_size = max(params->min_size, HASH_MIN_SIZE); if (params->nelem_hint) -- cgit v1.2.3 From a998f712f77ea4892d3fcf24e0a67603e63da128 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 19 Mar 2015 22:31:13 +0000 Subject: rhashtable: Round up/down min/max_size to ensure we respect limit Round up min_size respectively round down max_size to the next power of two to make sure we always respect the limit specified by the user. This is required because we compare the table size against the limit before we expand or shrink. Also fixes a minor bug where we modified min_size in the params provided instead of the copy stored in struct rhashtable. Signed-off-by: Thomas Graf Acked-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 5f8fe3e88219..e75c48d9d82f 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -933,8 +933,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) return -EINVAL; - params->min_size = max(params->min_size, HASH_MIN_SIZE); - if (params->nelem_hint) size = rounded_hashtable_size(params); @@ -942,6 +940,14 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) mutex_init(&ht->mutex); memcpy(&ht->p, params, sizeof(*params)); + if (params->min_size) + ht->p.min_size = roundup_pow_of_two(params->min_size); + + if (params->max_size) + ht->p.max_size = rounddown_pow_of_two(params->max_size); + + ht->p.min_size = max(params->min_size, HASH_MIN_SIZE); + if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); else -- cgit v1.2.3 From 488fb86ee91d3b1182c2e30a9f9b45da14eda46f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 20 Mar 2015 21:56:59 +1100 Subject: rhashtable: Make rhashtable_init params argument const This patch marks the rhashtable_init params argument const as there is no reason to modify it since we will always make a copy of it in the rhashtable. 
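[ Editor's note: a minimal caller sketch, not part of the patch, showing what the const parameter block enables once rhashtable_init() only reads *params and copies it into ht->p: the parameters can live in rodata. The names test_obj, test_params, test_ht and test_setup() are hypothetical; jhash is used explicitly because it matches the expected hashfn signature (key, length, seed).

#include <linux/rhashtable.h>
#include <linux/jhash.h>

struct test_obj {
        u32                     key;
        struct rhash_head       node;
};

/* Read-only for the lifetime of the table; rhashtable_init() copies it. */
static const struct rhashtable_params test_params = {
        .head_offset    = offsetof(struct test_obj, node),
        .key_offset     = offsetof(struct test_obj, key),
        .key_len        = sizeof(u32),
        .hashfn         = jhash,
};

static struct rhashtable test_ht;

static int test_setup(void)
{
        return rhashtable_init(&test_ht, &test_params);
}

With the copy held in ht->p, later adjustments such as the min_size clamp operate on the copy rather than on the caller's structure. ]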
This patch also fixes a bug where we don't actually round up the value of min_size unless it is less than HASH_MIN_SIZE. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e75c48d9d82f..e0a9d59f80d6 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -870,7 +870,7 @@ out: } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); -static size_t rounded_hashtable_size(struct rhashtable_params *params) +static size_t rounded_hashtable_size(const struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), (unsigned long)params->min_size); @@ -919,7 +919,8 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * .obj_hashfn = my_hash_fn, * }; */ -int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; @@ -946,7 +947,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (params->max_size) ht->p.max_size = rounddown_pow_of_two(params->max_size); - ht->p.min_size = max(params->min_size, HASH_MIN_SIZE); + ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); -- cgit v1.2.3 From 02fd97c3d4a8a14e222b0021c366db7041d28743 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 20 Mar 2015 21:57:00 +1100 Subject: rhashtable: Allow hash/comparison functions to be inlined This patch deals with the complaint that we make indirect function calls on the fast paths unnecessarily in rhashtable. We resolve it by moving the fast paths into inline functions that take struct rhashtable_param (which obviously must be the same set of parameters supplied to rhashtable_init) as an argument. The only remaining indirect call is to obj_hashfn (or key_hashfn it obj_hashfn is unset) on the rehash as well as the insert-during- rehash slow path. This patch also extends the support of vairable-length keys to include those where the key is fixed but scattered in the object. For example, in netlink we want to key off the namespace and the portid but they're not next to each other. This patch does this by directly using the object hash function as the indicator of whether the key is accessible or not. It also adds a new function obj_cmpfn to compare a key against an object. This means that the caller no longer needs to supply explicit compare functions. All this is done in a backwards compatible manner so no existing users are affected until they convert to the new interface. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 163 +++++++++++++++++-------------------------------------- 1 file changed, 50 insertions(+), 113 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e0a9d59f80d6..d1d23fb58525 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -1,13 +1,13 @@ /* * Resizable, Scalable, Concurrent Hash Table * + * Copyright (c) 2015 Herbert Xu * Copyright (c) 2014-2015 Thomas Graf * Copyright (c) 2008-2014 Patrick McHardy * - * Based on the following paper: - * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf - * * Code partially derived from nft_hash + * Rewritten with rehash code from br_multicast plus single list + * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -30,53 +30,11 @@ #define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 128UL -/* Base bits plus 1 bit for nulls marker */ -#define HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) - -/* The bucket lock is selected based on the hash and protects mutations - * on a group of hash buckets. - * - * A maximum of tbl->size/2 bucket locks is allocated. This ensures that - * a single lock always covers both buckets which may both contains - * entries which link to the same bucket of the old table during resizing. - * This allows to simplify the locking as locking the bucket in both - * tables during resize always guarantee protection. - * - * IMPORTANT: When holding the bucket lock of both the old and new table - * during expansions and shrinking, the old bucket lock must always be - * acquired first. - */ -static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash) -{ - return &tbl->locks[hash & tbl->locks_mask]; -} - -static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) -{ - return (void *) he - ht->p.head_offset; -} - -static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) -{ - return (hash >> HASH_RESERVED_SPACE) & (tbl->size - 1); -} - -static u32 key_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, - const void *key) -{ - return rht_bucket_index(tbl, ht->p.hashfn(key, ht->p.key_len, - tbl->hash_rnd)); -} - static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { - const char *ptr = rht_obj(ht, he); - - return likely(ht->p.key_len) ? - key_hashfn(ht, tbl, ptr + ht->p.key_offset) : - rht_bucket_index(tbl, ht->p.obj_hashfn(ptr, tbl->hash_rnd)); + return rht_head_hashfn(ht, tbl, he, ht->p); } #ifdef CONFIG_PROVE_LOCKING @@ -90,7 +48,7 @@ EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { - spinlock_t *lock = bucket_lock(tbl, hash); + spinlock_t *lock = rht_bucket_lock(tbl, hash); return (debug_locks) ? 
lockdep_is_held(lock) : 1; } @@ -178,32 +136,6 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, return tbl; } -/** - * rht_grow_above_75 - returns true if nelems > 0.75 * table-size - * @ht: hash table - * @tbl: current table - */ -static bool rht_grow_above_75(const struct rhashtable *ht, - const struct bucket_table *tbl) -{ - /* Expand table when exceeding 75% load */ - return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && - (!ht->p.max_size || tbl->size < ht->p.max_size); -} - -/** - * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size - * @ht: hash table - * @tbl: current table - */ -static bool rht_shrink_below_30(const struct rhashtable *ht, - const struct bucket_table *tbl) -{ - /* Shrink table beneath 30% load */ - return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && - tbl->size > ht->p.min_size; -} - static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); @@ -230,7 +162,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) new_hash = head_hashfn(ht, new_tbl, entry); - new_bucket_lock = bucket_lock(new_tbl, new_hash); + new_bucket_lock = rht_bucket_lock(new_tbl, new_hash); spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); head = rht_dereference_bucket(new_tbl->buckets[new_hash], @@ -255,7 +187,7 @@ static void rhashtable_rehash_chain(struct rhashtable *ht, unsigned old_hash) struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); spinlock_t *old_bucket_lock; - old_bucket_lock = bucket_lock(old_tbl, old_hash); + old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); spin_lock_bh(old_bucket_lock); while (!rhashtable_rehash_one(ht, old_hash)) @@ -376,6 +308,37 @@ unlock: mutex_unlock(&ht->mutex); } +int rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj, + struct bucket_table *tbl) +{ + struct rhash_head *head; + unsigned hash; + int err = -EEXIST; + + hash = head_hashfn(ht, tbl, obj); + spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + + if (key && rhashtable_lookup_fast(ht, key, ht->p)) + goto exit; + + err = 0; + + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + + RCU_INIT_POINTER(obj->next, head); + + rcu_assign_pointer(tbl->buckets[hash], obj); + + atomic_inc(&ht->nelems); + +exit: + spin_unlock(rht_bucket_lock(tbl, hash)); + + return err; +} +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); + static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, bool (*compare)(void *, void *), void *arg) { @@ -390,7 +353,7 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, old_tbl = rht_dereference_rcu(ht->tbl, ht); hash = head_hashfn(ht, old_tbl, obj); - old_lock = bucket_lock(old_tbl, hash); + old_lock = rht_bucket_lock(old_tbl, hash); spin_lock_bh(old_lock); @@ -403,7 +366,8 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, tbl = rht_dereference_rcu(old_tbl->future_tbl, ht) ?: old_tbl; if (tbl != old_tbl) { hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + spin_lock_nested(rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); } if (compare && @@ -430,7 +394,7 @@ static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, exit: if (tbl != old_tbl) - spin_unlock(bucket_lock(tbl, hash)); + spin_unlock(rht_bucket_lock(tbl, hash)); spin_unlock_bh(old_lock); @@ -471,7 +435,7 @@ static bool __rhashtable_remove(struct 
rhashtable *ht, bool ret = false; hash = head_hashfn(ht, tbl, obj); - lock = bucket_lock(tbl, hash); + lock = rht_bucket_lock(tbl, hash); spin_lock_bh(lock); @@ -537,19 +501,6 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) } EXPORT_SYMBOL_GPL(rhashtable_remove); -struct rhashtable_compare_arg { - struct rhashtable *ht; - const void *key; -}; - -static bool rhashtable_compare(void *ptr, void *arg) -{ - struct rhashtable_compare_arg *x = arg; - struct rhashtable *ht = x->ht; - - return !memcmp(ptr + ht->p.key_offset, x->key, ht->p.key_len); -} - /** * rhashtable_lookup - lookup key in hash table * @ht: hash table @@ -565,14 +516,7 @@ static bool rhashtable_compare(void *ptr, void *arg) */ void *rhashtable_lookup(struct rhashtable *ht, const void *key) { - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = key, - }; - - BUG_ON(!ht->p.key_len); - - return rhashtable_lookup_compare(ht, key, &rhashtable_compare, &arg); + return rhashtable_lookup_fast(ht, key, ht->p); } EXPORT_SYMBOL_GPL(rhashtable_lookup); @@ -591,7 +535,8 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); * Returns the first entry on which the compare function returned true. */ void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, - bool (*compare)(void *, void *), void *arg) + bool (*compare)(void *, void *), + void *arg) { const struct bucket_table *tbl; struct rhash_head *he; @@ -601,7 +546,7 @@ void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, tbl = rht_dereference_rcu(ht->tbl, ht); restart: - hash = key_hashfn(ht, tbl, key); + hash = rht_key_hashfn(ht, tbl, key, ht->p); rht_for_each_rcu(he, tbl, hash) { if (!compare(rht_obj(ht, he), arg)) continue; @@ -643,15 +588,7 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); */ bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) { - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = rht_obj(ht, obj) + ht->p.key_offset, - }; - - BUG_ON(!ht->p.key_len); - - return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare, - &arg); + return rhashtable_lookup_insert_fast(ht, obj, ht->p); } EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); @@ -927,8 +864,8 @@ int rhashtable_init(struct rhashtable *ht, size = HASH_DEFAULT_SIZE; - if ((params->key_len && !params->hashfn) || - (!params->key_len && !params->obj_hashfn)) + if ((!(params->key_len && params->hashfn) && !params->obj_hashfn) || + (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) -- cgit v1.2.3 From dc0ee268d85026530720d8c874716287b7ede25b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 20 Mar 2015 21:57:06 +1100 Subject: rhashtable: Rip out obsolete out-of-line interface Now that all rhashtable users have been converted over to the inline interface, this patch removes the unused out-of-line interface. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 284 ------------------------------------------------------- 1 file changed, 284 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index d1d23fb58525..83cfedd6612a 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -339,290 +339,6 @@ exit: } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); -static bool __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, - bool (*compare)(void *, void *), void *arg) -{ - struct bucket_table *tbl, *old_tbl; - struct rhash_head *head; - bool no_resize_running; - unsigned hash; - spinlock_t *old_lock; - bool success = true; - - rcu_read_lock(); - - old_tbl = rht_dereference_rcu(ht->tbl, ht); - hash = head_hashfn(ht, old_tbl, obj); - old_lock = rht_bucket_lock(old_tbl, hash); - - spin_lock_bh(old_lock); - - /* Because we have already taken the bucket lock in old_tbl, - * if we find that future_tbl is not yet visible then that - * guarantees all other insertions of the same entry will - * also grab the bucket lock in old_tbl because until the - * rehash completes ht->tbl won't be changed. - */ - tbl = rht_dereference_rcu(old_tbl->future_tbl, ht) ?: old_tbl; - if (tbl != old_tbl) { - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), - SINGLE_DEPTH_NESTING); - } - - if (compare && - rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, - compare, arg)) { - success = false; - goto exit; - } - - no_resize_running = tbl == old_tbl; - - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); - - if (rht_is_a_nulls(head)) - INIT_RHT_NULLS_HEAD(obj->next, ht, hash); - else - RCU_INIT_POINTER(obj->next, head); - - rcu_assign_pointer(tbl->buckets[hash], obj); - - atomic_inc(&ht->nelems); - if (no_resize_running && rht_grow_above_75(ht, tbl)) - schedule_work(&ht->run_work); - -exit: - if (tbl != old_tbl) - spin_unlock(rht_bucket_lock(tbl, hash)); - - spin_unlock_bh(old_lock); - - rcu_read_unlock(); - - return success; -} - -/** - * rhashtable_insert - insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * - * Will take a per bucket spinlock to protect against mutual mutations - * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket lock. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) -{ - __rhashtable_insert(ht, obj, NULL, NULL); -} -EXPORT_SYMBOL_GPL(rhashtable_insert); - -static bool __rhashtable_remove(struct rhashtable *ht, - struct bucket_table *tbl, - struct rhash_head *obj) -{ - struct rhash_head __rcu **pprev; - struct rhash_head *he; - spinlock_t * lock; - unsigned hash; - bool ret = false; - - hash = head_hashfn(ht, tbl, obj); - lock = rht_bucket_lock(tbl, hash); - - spin_lock_bh(lock); - - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { - if (he != obj) { - pprev = &he->next; - continue; - } - - rcu_assign_pointer(*pprev, obj->next); - ret = true; - break; - } - - spin_unlock_bh(lock); - - return ret; -} - -/** - * rhashtable_remove - remove object from hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * - * Since the hash chain is single linked, the removal operation needs to - * walk the bucket chain upon removal. 
The removal operation is thus - * considerable slow if the hash table is not correctly sized. - * - * Will automatically shrink the table via rhashtable_expand() if the - * shrink_decision function specified at rhashtable_init() returns true. - * - * The caller must ensure that no concurrent table mutations occur. It is - * however valid to have concurrent lookups if they are RCU protected. - */ -bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) -{ - struct bucket_table *tbl; - bool ret; - - rcu_read_lock(); - - tbl = rht_dereference_rcu(ht->tbl, ht); - - /* Because we have already taken (and released) the bucket - * lock in old_tbl, if we find that future_tbl is not yet - * visible then that guarantees the entry to still be in - * the old tbl if it exists. - */ - while (!(ret = __rhashtable_remove(ht, tbl, obj)) && - (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) - ; - - if (ret) { - atomic_dec(&ht->nelems); - if (rht_shrink_below_30(ht, tbl)) - schedule_work(&ht->run_work); - } - - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL_GPL(rhashtable_remove); - -/** - * rhashtable_lookup - lookup key in hash table - * @ht: hash table - * @key: pointer to key - * - * Computes the hash value for the key and traverses the bucket chain looking - * for a entry with an identical key. The first matching entry is returned. - * - * This lookup function may only be used for fixed key hash table (key_len - * parameter set). It will BUG() if used inappropriately. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - */ -void *rhashtable_lookup(struct rhashtable *ht, const void *key) -{ - return rhashtable_lookup_fast(ht, key, ht->p); -} -EXPORT_SYMBOL_GPL(rhashtable_lookup); - -/** - * rhashtable_lookup_compare - search hash table with compare function - * @ht: hash table - * @key: the pointer to the key - * @compare: compare function, must return true on match - * @arg: argument passed on to compare function - * - * Traverses the bucket chain behind the provided hash value and calls the - * specified compare function for each entry. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - * - * Returns the first entry on which the compare function returned true. - */ -void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, - bool (*compare)(void *, void *), - void *arg) -{ - const struct bucket_table *tbl; - struct rhash_head *he; - u32 hash; - - rcu_read_lock(); - - tbl = rht_dereference_rcu(ht->tbl, ht); -restart: - hash = rht_key_hashfn(ht, tbl, key, ht->p); - rht_for_each_rcu(he, tbl, hash) { - if (!compare(rht_obj(ht, he), arg)) - continue; - rcu_read_unlock(); - return rht_obj(ht, he); - } - - /* Ensure we see any new tables. */ - smp_rmb(); - - tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (unlikely(tbl)) - goto restart; - rcu_read_unlock(); - - return NULL; -} -EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); - -/** - * rhashtable_lookup_insert - lookup and insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * - * This lookup function may only be used for fixed key hash table (key_len - * parameter set). 
It will BUG() if used inappropriately. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) -{ - return rhashtable_lookup_insert_fast(ht, obj, ht->p); -} -EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); - -/** - * rhashtable_lookup_compare_insert - search and insert object to hash table - * with compare function - * @ht: hash table - * @obj: pointer to hash head inside object - * @compare: compare function, must return true on match - * @arg: argument passed on to compare function - * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -bool rhashtable_lookup_compare_insert(struct rhashtable *ht, - struct rhash_head *obj, - bool (*compare)(void *, void *), - void *arg) -{ - BUG_ON(!ht->p.key_len); - - return __rhashtable_insert(ht, obj, compare, arg); -} -EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); - /** * rhashtable_walk_init - Initialise an iterator * @ht: Table to walk over -- cgit v1.2.3 From d88252f9bb74d266653542b753f9ab404e8b88db Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Mar 2015 00:50:19 +1100 Subject: rhashtable: Add barrier to ensure we see new tables in walker The walker is a lockless reader so it too needs an smp_rmb before reading the future_tbl field in order to see any new tables that may contain elements that we should have walked over. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 83cfedd6612a..618a3f00d712 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -477,6 +477,9 @@ next: iter->skip = 0; } + /* Ensure we see any new tables. */ + smp_rmb(); + iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (iter->walker->tbl) { iter->slot = 0; -- cgit v1.2.3 From 31ccde2dacea8375c3a7d6fffbf0060ee0d40214 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Mar 2015 00:50:21 +1100 Subject: rhashtable: Allow hashfn to be unset Since every current rhashtable user uses jhash as their hash function, the fact that jhash is an inline function causes each user to generate a copy of its code. This function provides a solution to this problem by allowing hashfn to be unset. In which case rhashtable will automatically set it to jhash. Furthermore, if the key length is a multiple of 4, we will switch over to jhash2. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 618a3f00d712..798f01d64ab0 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -532,6 +532,11 @@ static size_t rounded_hashtable_size(const struct rhashtable_params *params) (unsigned long)params->min_size); } +static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) +{ + return jhash2(key, length, seed); +} + /** * rhashtable_init - initialize a new hash table * @ht: hash table to be initialized @@ -583,7 +588,7 @@ int rhashtable_init(struct rhashtable *ht, size = HASH_DEFAULT_SIZE; - if ((!(params->key_len && params->hashfn) && !params->obj_hashfn) || + if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; @@ -610,6 +615,16 @@ int rhashtable_init(struct rhashtable *ht, else ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; + ht->key_len = ht->p.key_len; + if (!params->hashfn) { + ht->p.hashfn = jhash; + + if (!(ht->key_len & (sizeof(u32) - 1))) { + ht->key_len /= sizeof(u32); + ht->p.hashfn = rhashtable_jhash2; + } + } + tbl = bucket_table_alloc(ht, size); if (tbl == NULL) return -ENOMEM; -- cgit v1.2.3 From 18093d1c0d1e032142ee24825678b0a8977d74ba Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Mar 2015 00:50:25 +1100 Subject: rhashtable: Shrink to fit This patch changes rhashtable_shrink to shrink to the smallest size possible rather than halving the table. This is needed because with multiple rehashing we will defer shrinking until all other rehashing is done, meaning that when we do shrink we may be able to shrink a lot. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 798f01d64ab0..9623be345d9c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -261,8 +261,8 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); * rhashtable_shrink - Shrink hash table while allowing concurrent lookups * @ht: the hash table to shrink * - * This function may only be called in a context where it is safe to call - * synchronize_rcu(), e.g. not within a rcu_read_lock() section. + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. * * The caller must ensure that no concurrent resizing occurs by holding * ht->mutex. @@ -276,10 +276,17 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + unsigned size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size / 2); + if (size < ht->p.min_size) + size = ht->p.min_size; + + if (old_tbl->size <= size) + return 0; + + new_tbl = bucket_table_alloc(ht, size); if (new_tbl == NULL) return -ENOMEM; -- cgit v1.2.3 From b824478b2145be78ac19e1cf44e2b9036c7a9608 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Mar 2015 00:50:26 +1100 Subject: rhashtable: Add multiple rehash support This patch adds the missing bits to allow multiple rehashes. The read-side as well as remove already handle this correctly. So it's only the rehasher and insertion that need modification to handle this. 
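[ Editor's note: a simplified sketch, not taken from this patch, of the reader-side pattern referred to above. With several rehashes pending, every bucket_table reachable through future_tbl may still hold entries, so a lookup keeps following the chain until it runs out; lookup_one_table() stands in for a hypothetical per-table bucket search:

static void *lookup_walk_tables(struct rhashtable *ht, const void *key)
{
        struct bucket_table *tbl;
        void *obj;

        rcu_read_lock();
        tbl = rht_dereference_rcu(ht->tbl, ht);

        do {
                obj = lookup_one_table(ht, tbl, key); /* hypothetical helper */
                if (obj)
                        break;

                /* Pairs with the smp_wmb() issued after future_tbl is
                 * published, so a table attached by a rehash in progress
                 * is not missed.
                 */
                smp_rmb();
                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        } while (tbl);

        rcu_read_unlock();
        return obj;
}

The rehasher walks the same chain in the opposite direction: rhashtable_last_table() below follows future_tbl to its end so that entries are always moved into the newest table. ]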
Note that this patch doesn't actually enable it so for now rehashing is still only performed by the worker thread. This patch also disables the explicit expand/shrink interface because the table is meant to expand and shrink automatically, and continuing to export these interfaces unnecessarily complicates the life of the rehasher since the rehash process is now composed of two parts. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 15 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 9623be345d9c..5e04403e25f5 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -136,11 +136,24 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, return tbl; } +static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, + struct bucket_table *tbl) +{ + struct bucket_table *new_tbl; + + do { + new_tbl = tbl; + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + } while (tbl); + + return new_tbl; +} + static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - struct bucket_table *new_tbl = - rht_dereference(old_tbl->future_tbl, ht) ?: old_tbl; + struct bucket_table *new_tbl = rhashtable_last_table(ht, + rht_dereference_rcu(old_tbl->future_tbl, ht)); struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; int err = -ENOENT; struct rhash_head *head, *next, *entry; @@ -196,12 +209,18 @@ static void rhashtable_rehash_chain(struct rhashtable *ht, unsigned old_hash) spin_unlock_bh(old_bucket_lock); } -static void rhashtable_rehash(struct rhashtable *ht, - struct bucket_table *new_tbl) +static int rhashtable_rehash_attach(struct rhashtable *ht, + struct bucket_table *old_tbl, + struct bucket_table *new_tbl) { - struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - struct rhashtable_walker *walker; - unsigned old_hash; + /* Protect future_tbl using the first bucket lock. */ + spin_lock_bh(old_tbl->locks); + + /* Did somebody beat us to it? */ + if (rcu_access_pointer(old_tbl->future_tbl)) { + spin_unlock_bh(old_tbl->locks); + return -EEXIST; + } /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. @@ -211,6 +230,22 @@ static void rhashtable_rehash(struct rhashtable *ht, /* Ensure the new table is visible to readers. */ smp_wmb(); + spin_unlock_bh(old_tbl->locks); + + return 0; +} + +static int rhashtable_rehash_table(struct rhashtable *ht) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; + struct rhashtable_walker *walker; + unsigned old_hash; + + new_tbl = rht_dereference(old_tbl->future_tbl, ht); + if (!new_tbl) + return 0; + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) rhashtable_rehash_chain(ht, old_hash); @@ -225,6 +260,8 @@ static void rhashtable_rehash(struct rhashtable *ht, * remain. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); + + return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } /** @@ -242,20 +279,25 @@ static void rhashtable_rehash(struct rhashtable *ht, * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. 
*/ -int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_expand(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + int err; ASSERT_RHT_MUTEX(ht); + old_tbl = rhashtable_last_table(ht, old_tbl); + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); if (new_tbl == NULL) return -ENOMEM; - rhashtable_rehash(ht, new_tbl); - return 0; + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); + + return err; } -EXPORT_SYMBOL_GPL(rhashtable_expand); /** * rhashtable_shrink - Shrink hash table while allowing concurrent lookups @@ -273,10 +315,11 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. */ -int rhashtable_shrink(struct rhashtable *ht) +static int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); unsigned size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); + int err; ASSERT_RHT_MUTEX(ht); @@ -286,19 +329,25 @@ int rhashtable_shrink(struct rhashtable *ht) if (old_tbl->size <= size) return 0; + if (rht_dereference(old_tbl->future_tbl, ht)) + return -EEXIST; + new_tbl = bucket_table_alloc(ht, size); if (new_tbl == NULL) return -ENOMEM; - rhashtable_rehash(ht, new_tbl); - return 0; + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); + + return err; } -EXPORT_SYMBOL_GPL(rhashtable_shrink); static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; + int err = 0; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); @@ -306,13 +355,20 @@ static void rht_deferred_worker(struct work_struct *work) goto unlock; tbl = rht_dereference(ht->tbl, ht); + tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) rhashtable_expand(ht); else if (rht_shrink_below_30(ht, tbl)) rhashtable_shrink(ht); + + err = rhashtable_rehash_table(ht); + unlock: mutex_unlock(&ht->mutex); + + if (err) + schedule_work(&ht->run_work); } int rhashtable_insert_slow(struct rhashtable *ht, const void *key, @@ -323,6 +379,7 @@ int rhashtable_insert_slow(struct rhashtable *ht, const void *key, unsigned hash; int err = -EEXIST; + tbl = rhashtable_last_table(ht, tbl); hash = head_hashfn(ht, tbl, obj); spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); -- cgit v1.2.3 From b9ecfdaa1090b5988422eaf5348ea1954d2d7219 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Mar 2015 00:50:27 +1100 Subject: rhashtable: Allow GFP_ATOMIC bucket table allocation This patch adds the ability to allocate bucket table with GFP_ATOMIC instead of GFP_KERNEL. This is needed when we perform an immediate rehash during insertion. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 5e04403e25f5..220a11a13d40 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -58,7 +58,8 @@ EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #endif -static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) +static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, + gfp_t gfp) { unsigned int i, size; #if defined(CONFIG_PROVE_LOCKING) @@ -75,12 +76,13 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) if (sizeof(spinlock_t) != 0) { #ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) + if (size * sizeof(spinlock_t) > PAGE_SIZE && + gfp == GFP_KERNEL) tbl->locks = vmalloc(size * sizeof(spinlock_t)); else #endif tbl->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + gfp); if (!tbl->locks) return -ENOMEM; for (i = 0; i < size; i++) @@ -105,23 +107,25 @@ static void bucket_table_free_rcu(struct rcu_head *head) } static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - size_t nbuckets) + size_t nbuckets, + gfp_t gfp) { struct bucket_table *tbl = NULL; size_t size; int i; size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) - tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - if (tbl == NULL) + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || + gfp != GFP_KERNEL) + tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); + if (tbl == NULL && gfp == GFP_KERNEL) tbl = vzalloc(size); if (tbl == NULL) return NULL; tbl->size = nbuckets; - if (alloc_bucket_locks(ht, tbl) < 0) { + if (alloc_bucket_locks(ht, tbl, gfp) < 0) { bucket_table_free(tbl); return NULL; } @@ -288,7 +292,7 @@ static int rhashtable_expand(struct rhashtable *ht) old_tbl = rhashtable_last_table(ht, old_tbl); - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -332,7 +336,7 @@ static int rhashtable_shrink(struct rhashtable *ht) if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; - new_tbl = bucket_table_alloc(ht, size); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -689,7 +693,7 @@ int rhashtable_init(struct rhashtable *ht, } } - tbl = bucket_table_alloc(ht, size); + tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (tbl == NULL) return -ENOMEM; -- cgit v1.2.3 From ccd57b1bd32460d27bbb9c599e795628a3c66983 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Mar 2015 00:50:28 +1100 Subject: rhashtable: Add immediate rehash during insertion This patch reintroduces immediate rehash during insertion. If we find during insertion that the table is full or the chain length exceeds a set limit (currently 16 but may be disabled with insecure_elasticity) then we will force an immediate rehash. The rehash will contain an expansion if the table utilisation exceeds 75%. If this rehash fails then the insertion will fail. Otherwise the insertion will be reattempted in the new hash table. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 220a11a13d40..7686c1e9934a 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -375,21 +375,76 @@ unlock: schedule_work(&ht->run_work); } +static bool rhashtable_check_elasticity(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned hash) +{ + unsigned elasticity = ht->elasticity; + struct rhash_head *head; + + rht_for_each(head, tbl, hash) + if (!--elasticity) + return true; + + return false; +} + +int rhashtable_insert_rehash(struct rhashtable *ht) +{ + struct bucket_table *old_tbl; + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int size; + int err; + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + tbl = rhashtable_last_table(ht, old_tbl); + + size = tbl->size; + + if (rht_grow_above_75(ht, tbl)) + size *= 2; + /* More than two rehashes (not resizes) detected. */ + else if (WARN_ON(old_tbl != tbl && old_tbl->size == size)) + return -EBUSY; + + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); + if (new_tbl == NULL) + return -ENOMEM; + + err = rhashtable_rehash_attach(ht, tbl, new_tbl); + if (err) { + bucket_table_free(new_tbl); + if (err == -EEXIST) + err = 0; + } else + schedule_work(&ht->run_work); + + return err; +} +EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); + int rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj, struct bucket_table *tbl) { struct rhash_head *head; unsigned hash; - int err = -EEXIST; + int err; tbl = rhashtable_last_table(ht, tbl); hash = head_hashfn(ht, tbl, obj); spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + err = -EEXIST; if (key && rhashtable_lookup_fast(ht, key, ht->p)) goto exit; + err = -EAGAIN; + if (rhashtable_check_elasticity(ht, tbl, hash) || + rht_grow_above_100(ht, tbl)) + goto exit; + err = 0; head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); @@ -678,6 +733,9 @@ int rhashtable_init(struct rhashtable *ht, ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + if (!params->insecure_elasticity) + ht->elasticity = 16; + if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); else -- cgit v1.2.3 From ba7c95ea3870fe7b847466d39a049ab6f156aa2c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Mar 2015 09:53:17 +1100 Subject: rhashtable: Fix sleeping inside RCU critical section in walk_stop The commit 963ecbd41a1026d99ec7537c050867428c397b89 ("rhashtable: Fix use-after-free in rhashtable_walk_stop") fixed a real bug but created another one because we may end up sleeping inside an RCU critical section. This patch fixes it properly by replacing the mutex with a spin lock that specifically protects the walker lists. Reported-by: Sasha Levin Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 7686c1e9934a..e96ad1a52c90 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -256,8 +256,10 @@ static int rhashtable_rehash_table(struct rhashtable *ht) /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); + spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; + spin_unlock(&ht->lock); /* Wait for readers. 
All new readers will see the new * table, and thus no references to the old table will @@ -635,12 +637,12 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) ht = iter->ht; - mutex_lock(&ht->mutex); + spin_lock(&ht->lock); if (tbl->rehash < tbl->size) list_add(&iter->walker->list, &tbl->walkers); else iter->walker->tbl = NULL; - mutex_unlock(&ht->mutex); + spin_unlock(&ht->lock); iter->p = NULL; @@ -723,6 +725,7 @@ int rhashtable_init(struct rhashtable *ht, memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); + spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); if (params->min_size) -- cgit v1.2.3 From 27ed44a5d6d88897002b75f53004d4c565a5aab6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 24 Mar 2015 13:37:30 +1100 Subject: rhashtable: Add comment on choice of elasticity value This patch adds a comment on the choice of the value 16 as the maximum chain length before we force a rehash. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e96ad1a52c90..8514f7c5f029 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -736,6 +736,18 @@ int rhashtable_init(struct rhashtable *ht, ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + /* The maximum (not average) chain length grows with the + * size of the hash table, at a rate of (log N)/(log log N). + * The value of 16 is selected so that even if the hash + * table grew to 2^32 you would not expect the maximum + * chain length to exceed it unless we are under attack + * (or extremely unlucky). + * + * As this limit is only to detect attacks, we don't need + * to set it to a lower value as you'd need the chain + * length to vastly exceed 16 to have any real effect + * on the system. + */ if (!params->insecure_elasticity) ht->elasticity = 16; -- cgit v1.2.3 From 299e5c32a37a6bca8175db177117467bd1ce970a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 24 Mar 2015 14:18:17 +0100 Subject: rhashtable: Use 'unsigned int' consistently Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 8514f7c5f029..50abe4fec4b8 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -153,7 +153,7 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, return new_tbl; } -static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) +static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, @@ -162,7 +162,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned old_hash) int err = -ENOENT; struct rhash_head *head, *next, *entry; spinlock_t *new_bucket_lock; - unsigned new_hash; + unsigned int new_hash; rht_for_each(entry, old_tbl, old_hash) { err = 0; @@ -199,7 +199,8 @@ out: return err; } -static void rhashtable_rehash_chain(struct rhashtable *ht, unsigned old_hash) +static void rhashtable_rehash_chain(struct rhashtable *ht, + unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); spinlock_t *old_bucket_lock; @@ -244,7 +245,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht) struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl; struct rhashtable_walker *walker; - unsigned old_hash; + unsigned int old_hash; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) @@ -324,11 +325,12 @@ static int rhashtable_expand(struct rhashtable *ht) static int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); - unsigned size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); + unsigned int size; int err; ASSERT_RHT_MUTEX(ht); + size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); if (size < ht->p.min_size) size = ht->p.min_size; @@ -379,9 +381,9 @@ unlock: static bool rhashtable_check_elasticity(struct rhashtable *ht, struct bucket_table *tbl, - unsigned hash) + unsigned int hash) { - unsigned elasticity = ht->elasticity; + unsigned int elasticity = ht->elasticity; struct rhash_head *head; rht_for_each(head, tbl, hash) @@ -431,7 +433,7 @@ int rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct bucket_table *tbl) { struct rhash_head *head; - unsigned hash; + unsigned int hash; int err; tbl = rhashtable_last_table(ht, tbl); -- cgit v1.2.3 From b5e2c150ac914f28a28833b57397bec0b0a2bd5f Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 24 Mar 2015 20:42:19 +0000 Subject: rhashtable: Disable automatic shrinking by default Introduce a new bool automatic_shrinking to require the user to explicitly opt-in to automatic shrinking of tables. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 50abe4fec4b8..50374d181148 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -367,7 +367,7 @@ static void rht_deferred_worker(struct work_struct *work) if (rht_grow_above_75(ht, tbl)) rhashtable_expand(ht); - else if (rht_shrink_below_30(ht, tbl)) + else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) rhashtable_shrink(ht); err = rhashtable_rehash_table(ht); -- cgit v1.2.3
From 6b6f302ceda7a052dab545d6c69abf5f0d4a6cab Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 24 Mar 2015 14:18:20 +0100 Subject: rhashtable: Add rhashtable_free_and_destroy() rhashtable_destroy() variant which stops rehashes, iterates over the table and calls a callback to release resources. Avoids the need for nft_hash to embed rhashtable internals and allows us to get rid of the being_destroyed flag. It also saves a 2nd mutex lock upon destruction. Also fixes an RCU lockdep splat on nft set destruction due to calling rht_for_each_entry_safe() without holding bucket locks. Open code this loop as we need to know that no mutations may occur in parallel. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- lib/rhashtable.c | 49 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 50374d181148..4b7b7e672b93 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -359,8 +359,6 @@ static void rht_deferred_worker(struct work_struct *work) ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); - if (ht->being_destroyed) - goto unlock; tbl = rht_dereference(ht->tbl, ht); tbl = rhashtable_last_table(ht, tbl); @@ -372,7 +370,6 @@ static void rht_deferred_worker(struct work_struct *work) err = rhashtable_rehash_table(ht); -unlock: mutex_unlock(&ht->mutex); if (err) @@ -783,21 +780,53 @@ int rhashtable_init(struct rhashtable *ht, EXPORT_SYMBOL_GPL(rhashtable_init); /** - * rhashtable_destroy - destroy hash table + * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn * - * Frees the bucket array. This function is not rcu safe, therefore the caller - * has to make sure that no resizing may happen by unpublishing the hashtable - * and waiting for the quiescent cycle before releasing the bucket array. + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to release resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array. + * + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occur in parallel. */ -void rhashtable_destroy(struct rhashtable *ht) +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg) { - ht->being_destroyed = true; + const struct bucket_table *tbl; + unsigned int i; cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); - bucket_table_free(rht_dereference(ht->tbl, ht)); + tbl = rht_dereference(ht->tbl, ht); + if (free_fn) { + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos, *next; + + for (pos = rht_dereference(tbl->buckets[i], ht), + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL; + !rht_is_a_nulls(pos); + pos = next, + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL) + free_fn(rht_obj(ht, pos), arg); + } + } + + bucket_table_free(tbl); mutex_unlock(&ht->mutex); } +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ + return rhashtable_free_and_destroy(ht, NULL, NULL); +} EXPORT_SYMBOL_GPL(rhashtable_destroy); -- cgit v1.2.3
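As a rough illustration of the new teardown API, a table owner whose elements are kmalloc'ed might release them as in the sketch below. The struct my_obj layout, the my_free_fn()/my_table_destroy() helpers and the use of kfree_rcu() are assumptions made for this example, not part of the patch:

#include <linux/rhashtable.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
	u32			key;
	struct rhash_head	node;
	struct rcu_head		rcu;
};

static void my_free_fn(void *ptr, void *arg)
{
	struct my_obj *obj = ptr;

	/* RCU readers may still be traversing the table, so defer the
	 * actual free until after a grace period.
	 */
	kfree_rcu(obj, rcu);
}

static void my_table_destroy(struct rhashtable *ht)
{
	/* The caller must ensure no inserts or removals run in parallel. */
	rhashtable_free_and_destroy(ht, my_free_fn, NULL);
}

Passing a NULL free_fn degenerates to plain rhashtable_destroy(), which the patch now implements as a wrapper around rhashtable_free_and_destroy().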
From 49f7b33e63fec9d16e7ee62ba8f8ab4159cbdc26 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 25 Mar 2015 13:07:45 +0000 Subject: rhashtable: provide len to obj_hashfn nftables sets will be converted to use so-called setextensions, moving the key to a non-fixed position. To hash it, the obj_hashfn must be used; however, it does not yet receive the length parameter. Pass the key length to obj_hashfn() and convert existing users. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- lib/rhashtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 4b7b7e672b93..4898442b837f 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -691,7 +691,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) * struct rhash_head node; * }; * - * u32 my_hash_fn(const void *data, u32 seed) + * u32 my_hash_fn(const void *data, u32 len, u32 seed) * { * struct test_obj *obj = data; * -- cgit v1.2.3
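To make the new signature concrete, a hedged sketch of an obj_hashfn after this change follows; struct my_elem, my_obj_hashfn() and the choice of jhash() are illustrative assumptions rather than code from the patch:

#include <linux/jhash.h>
#include <linux/rhashtable.h>

struct my_elem {
	struct rhash_head	node;
	u32			key;	/* may live at a non-fixed offset */
};

static u32 my_obj_hashfn(const void *data, u32 len, u32 seed)
{
	const struct my_elem *elem = data;

	/* len is the key length handed in by rhashtable; hash only
	 * the key, not the whole object.
	 */
	return jhash(&elem->key, len, seed);
}

static const struct rhashtable_params my_params = {
	.head_offset	= offsetof(struct my_elem, node),
	.key_len	= sizeof(u32),
	.obj_hashfn	= my_obj_hashfn,
	/* an obj_cmpfn is also required whenever obj_hashfn is set */
};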
From e2307ed6cbe71c74e291681aaa7e92ab98bc3177 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 22 Apr 2015 09:41:45 +0200 Subject: rhashtable: Schedule async resize when sync realloc fails When rhashtable_insert_rehash() fails with ENOMEM, this indicates that we can't allocate the necessary memory in the current context but the limits as set by the user would still allow the table to grow. Thus attempt an async resize in the background where we can allocate using GFP_KERNEL, which is more likely to succeed. The insertion itself will still fail, to indicate pressure. This fixes a bug where the table would never continue growing once the utilization is above 100%. Fixes: ccd57b1bd324 ("rhashtable: Add immediate rehash during insertion") Signed-off-by: Thomas Graf Acked-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 4898442b837f..f648cfde8520 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -410,8 +410,13 @@ int rhashtable_insert_rehash(struct rhashtable *ht) return -EBUSY; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); - if (new_tbl == NULL) + if (new_tbl == NULL) { + /* Schedule async resize/rehash to try allocation in + * non-atomic context. + */ + schedule_work(&ht->run_work); return -ENOMEM; + } err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { -- cgit v1.2.3
From a87b9ebf1709687ff213091d0fdb4254b1564803 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 22 Apr 2015 09:41:46 +0200 Subject: rhashtable: Do not schedule more than one rehash if we can't grow further The current code only stops inserting rehashes into the chain when no resizes are currently scheduled. As long as resizes are scheduled and while inserting above the utilization watermark, more and more rehashes will be scheduled. This leads to a perfect DoS storm with thousands of rehashes scheduled, which in turn leads to thousands of spinlocks being taken sequentially. Instead, only allow either a series of resizes or a single rehash. Drop any further rehashes and return -EBUSY. Fixes: ccd57b1bd324 ("rhashtable: Add immediate rehash during insertion") Signed-off-by: Thomas Graf Acked-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/rhashtable.c') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index f648cfde8520..b28df4019ade 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -405,8 +405,8 @@ int rhashtable_insert_rehash(struct rhashtable *ht) if (rht_grow_above_75(ht, tbl)) size *= 2; - /* More than two rehashes (not resizes) detected. */ - else if (WARN_ON(old_tbl != tbl && old_tbl->size == size)) + /* Do not schedule more than one rehash */ + else if (old_tbl != tbl) return -EBUSY; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); -- cgit v1.2.3
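Taken together, the user-visible knobs touched by this series amount to little more than one extra parameter at init time. A minimal setup sketch, in which the struct test_obj layout and the parameter values are assumptions chosen for illustration, might look like this:

#include <linux/rhashtable.h>

struct test_obj {
	int			value;
	struct rhash_head	node;
};

static const struct rhashtable_params test_params = {
	.head_offset		= offsetof(struct test_obj, node),
	.key_offset		= offsetof(struct test_obj, value),
	.key_len		= sizeof(int),
	/* Shrinking is now opt-in; leave this false and the table
	 * will only ever grow.
	 */
	.automatic_shrinking	= true,
};

static int my_table_setup(struct rhashtable *ht)
{
	/* With the fixes above, a failed GFP_ATOMIC table allocation in
	 * the insert path is retried from the async worker, and at most
	 * one extra rehash is scheduled at a time.
	 */
	return rhashtable_init(ht, &test_params);
}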