From 3cf92222a39cc7842c373dd90a0c204fa7d7cced Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 3 Dec 2015 20:41:29 +0800
Subject: rhashtable: Prevent spurious EBUSY errors on insertion

Thomas and Phil observed that under stress rhashtable insertion
sometimes failed with EBUSY, even though this error should only
ever been seen when we're under attack and our hash chain length
has grown to an unacceptable level, even after a rehash.

It turns out that the logic for detecting whether there is an
existing rehash is faulty.  In particular, when two threads both
try to grow the same table at the same time, one of them may see
the newly grown table and thus erroneously conclude that it had
been rehashed.  This is what leads to the EBUSY error.

This patch fixes this by remembering the current last table we
used during insertion so that rhashtable_insert_rehash can detect
when another thread has also done a resize/rehash.  When this is
detected we will give up our resize/rehash and simply retry the
insertion with the new table.

Reported-by: Thomas Graf <tgraf@suug.ch>
Reported-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Tested-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 45 ++++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 15 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index a54ff8949f91..2ff7ed91663a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -389,33 +389,31 @@ static bool rhashtable_check_elasticity(struct rhashtable *ht,
 	return false;
 }
 
-int rhashtable_insert_rehash(struct rhashtable *ht)
+int rhashtable_insert_rehash(struct rhashtable *ht,
+			     struct bucket_table *tbl)
 {
 	struct bucket_table *old_tbl;
 	struct bucket_table *new_tbl;
-	struct bucket_table *tbl;
 	unsigned int size;
 	int err;
 
 	old_tbl = rht_dereference_rcu(ht->tbl, ht);
-	tbl = rhashtable_last_table(ht, old_tbl);
 
 	size = tbl->size;
 
+	err = -EBUSY;
+
 	if (rht_grow_above_75(ht, tbl))
 		size *= 2;
 	/* Do not schedule more than one rehash */
 	else if (old_tbl != tbl)
-		return -EBUSY;
+		goto fail;
+
+	err = -ENOMEM;
 
 	new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
-	if (new_tbl == NULL) {
-		/* Schedule async resize/rehash to try allocation
-		 * non-atomic context.
-		 */
-		schedule_work(&ht->run_work);
-		return -ENOMEM;
-	}
+	if (new_tbl == NULL)
+		goto fail;
 
 	err = rhashtable_rehash_attach(ht, tbl, new_tbl);
 	if (err) {
@@ -426,12 +424,24 @@ int rhashtable_insert_rehash(struct rhashtable *ht)
 		schedule_work(&ht->run_work);
 
 	return err;
+
+fail:
+	/* Do not fail the insert if someone else did a rehash. */
+	if (likely(rcu_dereference_raw(tbl->future_tbl)))
+		return 0;
+
+	/* Schedule async rehash to retry allocation in process context. */
+	if (err == -ENOMEM)
+		schedule_work(&ht->run_work);
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(rhashtable_insert_rehash);
 
-int rhashtable_insert_slow(struct rhashtable *ht, const void *key,
-			   struct rhash_head *obj,
-			   struct bucket_table *tbl)
+struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
+					    const void *key,
+					    struct rhash_head *obj,
+					    struct bucket_table *tbl)
 {
 	struct rhash_head *head;
 	unsigned int hash;
@@ -467,7 +477,12 @@ int rhashtable_insert_slow(struct rhashtable *ht, const void *key,
 exit:
 	spin_unlock(rht_bucket_lock(tbl, hash));
 
-	return err;
+	if (err == 0)
+		return NULL;
+	else if (err == -EAGAIN)
+		return tbl;
+	else
+		return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
 
-- 
cgit v1.2.3


From d3716f18a7d841565c930efde30737a3557eee69 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 4 Dec 2015 22:39:56 +0800
Subject: rhashtable: Use __vmalloc with GFP_ATOMIC for table allocation

When an rhashtable user pounds rhashtable hard with back-to-back
insertions we may end up growing the table in GFP_ATOMIC context.
Unfortunately when the table reaches a certain size this often
fails because we don't have enough physically contiguous pages
to hold the new table.

Eric Dumazet suggested (and in fact wrote this patch) using
__vmalloc instead which can be used in GFP_ATOMIC context.

Reported-by: Phil Sutter <phil@nwl.cc>
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 2ff7ed91663a..1c624db90e88 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -120,8 +120,9 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
 	    gfp != GFP_KERNEL)
 		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
-	if (tbl == NULL && gfp == GFP_KERNEL)
-		tbl = vzalloc(size);
+	if (tbl == NULL)
+		tbl = __vmalloc(size, gfp | __GFP_HIGHMEM | __GFP_ZERO,
+				PAGE_KERNEL);
 	if (tbl == NULL)
 		return NULL;
 
-- 
cgit v1.2.3


From a90099d9fabd2458084b9c2b79f1a62d9b76a61a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 5 Dec 2015 22:47:11 -0500
Subject: Revert "rhashtable: Use __vmalloc with GFP_ATOMIC for table
 allocation"

This reverts commit d3716f18a7d841565c930efde30737a3557eee69.

vmalloc cannot be used in BH disabled contexts, even
with GFP_ATOMIC.  And we certainly want to support
rhashtable users inserting entries with software
interrupts disabled.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 1c624db90e88..2ff7ed91663a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -120,9 +120,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
 	    gfp != GFP_KERNEL)
 		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
-	if (tbl == NULL)
-		tbl = __vmalloc(size, gfp | __GFP_HIGHMEM | __GFP_ZERO,
-				PAGE_KERNEL);
+	if (tbl == NULL && gfp == GFP_KERNEL)
+		tbl = vzalloc(size);
 	if (tbl == NULL)
 		return NULL;
 
-- 
cgit v1.2.3


From 3a324606bbabfc30084ce9d08169910773ba9a92 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 16 Dec 2015 18:13:14 +0800
Subject: rhashtable: Enforce minimum size on initial hash table

William Hua <william.hua@canonical.com> wrote:
>
> I wasn't aware there was an enforced minimum size. I simply set the
> nelem_hint in the rhastable_params struct to 1, expecting it to grow as
> needed. This caused a segfault afterwards when trying to insert an
> element.

OK we're doing the size computation before we enforce the limit
on min_size.

---8<---
We need to do the initial hash table size computation after we
have obtained the correct min_size/max_size parameters.  Otherwise
we may end up with a hash table whose size is outside the allowed
envelope.

Fixes: a998f712f77e ("rhashtable: Round up/down min/max_size to...")
Reported-by: William Hua <william.hua@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 2ff7ed91663a..a98e71db7dd2 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -738,9 +738,6 @@ int rhashtable_init(struct rhashtable *ht,
 	if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
 		return -EINVAL;
 
-	if (params->nelem_hint)
-		size = rounded_hashtable_size(params);
-
 	memset(ht, 0, sizeof(*ht));
 	mutex_init(&ht->mutex);
 	spin_lock_init(&ht->lock);
@@ -760,6 +757,9 @@ int rhashtable_init(struct rhashtable *ht,
 
 	ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
 
+	if (params->nelem_hint)
+		size = rounded_hashtable_size(&ht->p);
+
 	/* The maximum (not average) chain length grows with the
 	 * size of the hash table, at a rate of (log N)/(log log N).
 	 * The value of 16 is selected so that even if the hash
-- 
cgit v1.2.3


From c6ff5268293ef98e48a99597e765ffc417e39fa5 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 16 Dec 2015 16:45:54 +0800
Subject: rhashtable: Fix walker list corruption

The commit ba7c95ea3870fe7b847466d39a049ab6f156aa2c ("rhashtable:
Fix sleeping inside RCU critical section in walk_stop") introduced
a new spinlock for the walker list.  However, it did not convert
all existing users of the list over to the new spin lock.  Some
continued to use the old mutext for this purpose.  This obviously
led to corruption of the list.

The fix is to use the spin lock everywhere where we touch the list.

This also allows us to do rcu_rad_lock before we take the lock in
rhashtable_walk_start.  With the old mutex this would've deadlocked
but it's safe with the new spin lock.

Fixes: ba7c95ea3870 ("rhashtable: Fix sleeping inside RCU...")
Reported-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'lib')

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index a98e71db7dd2..eb9240c458fa 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -518,10 +518,10 @@ int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter)
 	if (!iter->walker)
 		return -ENOMEM;
 
-	mutex_lock(&ht->mutex);
+	spin_lock(&ht->lock);
 	iter->walker->tbl = rht_dereference(ht->tbl, ht);
 	list_add(&iter->walker->list, &iter->walker->tbl->walkers);
-	mutex_unlock(&ht->mutex);
+	spin_unlock(&ht->lock);
 
 	return 0;
 }
@@ -535,10 +535,10 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init);
  */
 void rhashtable_walk_exit(struct rhashtable_iter *iter)
 {
-	mutex_lock(&iter->ht->mutex);
+	spin_lock(&iter->ht->lock);
 	if (iter->walker->tbl)
 		list_del(&iter->walker->list);
-	mutex_unlock(&iter->ht->mutex);
+	spin_unlock(&iter->ht->lock);
 	kfree(iter->walker);
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
@@ -562,14 +562,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter)
 {
 	struct rhashtable *ht = iter->ht;
 
-	mutex_lock(&ht->mutex);
+	rcu_read_lock();
 
+	spin_lock(&ht->lock);
 	if (iter->walker->tbl)
 		list_del(&iter->walker->list);
-
-	rcu_read_lock();
-
-	mutex_unlock(&ht->mutex);
+	spin_unlock(&ht->lock);
 
 	if (!iter->walker->tbl) {
 		iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht);
-- 
cgit v1.2.3