From 3cf92222a39cc7842c373dd90a0c204fa7d7cced Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 3 Dec 2015 20:41:29 +0800 Subject: rhashtable: Prevent spurious EBUSY errors on insertion Thomas and Phil observed that under stress rhashtable insertion sometimes failed with EBUSY, even though this error should only ever been seen when we're under attack and our hash chain length has grown to an unacceptable level, even after a rehash. It turns out that the logic for detecting whether there is an existing rehash is faulty. In particular, when two threads both try to grow the same table at the same time, one of them may see the newly grown table and thus erroneously conclude that it had been rehashed. This is what leads to the EBUSY error. This patch fixes this by remembering the current last table we used during insertion so that rhashtable_insert_rehash can detect when another thread has also done a resize/rehash. When this is detected we will give up our resize/rehash and simply retry the insertion with the new table. Reported-by: Thomas Graf Reported-by: Phil Sutter Signed-off-by: Herbert Xu Tested-by: Phil Sutter Signed-off-by: David S. Miller --- lib/rhashtable.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index a54ff8949f91..2ff7ed91663a 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -389,33 +389,31 @@ static bool rhashtable_check_elasticity(struct rhashtable *ht, return false; } -int rhashtable_insert_rehash(struct rhashtable *ht) +int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; - struct bucket_table *tbl; unsigned int size; int err; old_tbl = rht_dereference_rcu(ht->tbl, ht); - tbl = rhashtable_last_table(ht, old_tbl); size = tbl->size; + err = -EBUSY; + if (rht_grow_above_75(ht, tbl)) size *= 2; /* Do not schedule more than one rehash */ else if (old_tbl != tbl) - return -EBUSY; + goto fail; + + err = -ENOMEM; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); - if (new_tbl == NULL) { - /* Schedule async resize/rehash to try allocation - * non-atomic context. - */ - schedule_work(&ht->run_work); - return -ENOMEM; - } + if (new_tbl == NULL) + goto fail; err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { @@ -426,12 +424,24 @@ int rhashtable_insert_rehash(struct rhashtable *ht) schedule_work(&ht->run_work); return err; + +fail: + /* Do not fail the insert if someone else did a rehash. */ + if (likely(rcu_dereference_raw(tbl->future_tbl))) + return 0; + + /* Schedule async rehash to retry allocation in process context. */ + if (err == -ENOMEM) + schedule_work(&ht->run_work); + + return err; } EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -int rhashtable_insert_slow(struct rhashtable *ht, const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, + const void *key, + struct rhash_head *obj, + struct bucket_table *tbl) { struct rhash_head *head; unsigned int hash; @@ -467,7 +477,12 @@ int rhashtable_insert_slow(struct rhashtable *ht, const void *key, exit: spin_unlock(rht_bucket_lock(tbl, hash)); - return err; + if (err == 0) + return NULL; + else if (err == -EAGAIN) + return tbl; + else + return ERR_PTR(err); } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); -- cgit v1.2.3 From d3716f18a7d841565c930efde30737a3557eee69 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 4 Dec 2015 22:39:56 +0800 Subject: rhashtable: Use __vmalloc with GFP_ATOMIC for table allocation When an rhashtable user pounds rhashtable hard with back-to-back insertions we may end up growing the table in GFP_ATOMIC context. Unfortunately when the table reaches a certain size this often fails because we don't have enough physically contiguous pages to hold the new table. Eric Dumazet suggested (and in fact wrote this patch) using __vmalloc instead which can be used in GFP_ATOMIC context. Reported-by: Phil Sutter Suggested-by: Eric Dumazet Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 2ff7ed91663a..1c624db90e88 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -120,8 +120,9 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || gfp != GFP_KERNEL) tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); - if (tbl == NULL && gfp == GFP_KERNEL) - tbl = vzalloc(size); + if (tbl == NULL) + tbl = __vmalloc(size, gfp | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); if (tbl == NULL) return NULL; -- cgit v1.2.3 From a90099d9fabd2458084b9c2b79f1a62d9b76a61a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 5 Dec 2015 22:47:11 -0500 Subject: Revert "rhashtable: Use __vmalloc with GFP_ATOMIC for table allocation" This reverts commit d3716f18a7d841565c930efde30737a3557eee69. vmalloc cannot be used in BH disabled contexts, even with GFP_ATOMIC. And we certainly want to support rhashtable users inserting entries with software interrupts disabled. Signed-off-by: David S. Miller --- lib/rhashtable.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 1c624db90e88..2ff7ed91663a 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -120,9 +120,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || gfp != GFP_KERNEL) tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); - if (tbl == NULL) - tbl = __vmalloc(size, gfp | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + if (tbl == NULL && gfp == GFP_KERNEL) + tbl = vzalloc(size); if (tbl == NULL) return NULL; -- cgit v1.2.3 From 3a324606bbabfc30084ce9d08169910773ba9a92 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 16 Dec 2015 18:13:14 +0800 Subject: rhashtable: Enforce minimum size on initial hash table William Hua wrote: > > I wasn't aware there was an enforced minimum size. I simply set the > nelem_hint in the rhastable_params struct to 1, expecting it to grow as > needed. This caused a segfault afterwards when trying to insert an > element. OK we're doing the size computation before we enforce the limit on min_size. ---8<--- We need to do the initial hash table size computation after we have obtained the correct min_size/max_size parameters. Otherwise we may end up with a hash table whose size is outside the allowed envelope. Fixes: a998f712f77e ("rhashtable: Round up/down min/max_size to...") Reported-by: William Hua Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 2ff7ed91663a..a98e71db7dd2 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -738,9 +738,6 @@ int rhashtable_init(struct rhashtable *ht, if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) return -EINVAL; - if (params->nelem_hint) - size = rounded_hashtable_size(params); - memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); @@ -760,6 +757,9 @@ int rhashtable_init(struct rhashtable *ht, ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + if (params->nelem_hint) + size = rounded_hashtable_size(&ht->p); + /* The maximum (not average) chain length grows with the * size of the hash table, at a rate of (log N)/(log log N). * The value of 16 is selected so that even if the hash -- cgit v1.2.3 From c6ff5268293ef98e48a99597e765ffc417e39fa5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 16 Dec 2015 16:45:54 +0800 Subject: rhashtable: Fix walker list corruption The commit ba7c95ea3870fe7b847466d39a049ab6f156aa2c ("rhashtable: Fix sleeping inside RCU critical section in walk_stop") introduced a new spinlock for the walker list. However, it did not convert all existing users of the list over to the new spin lock. Some continued to use the old mutext for this purpose. This obviously led to corruption of the list. The fix is to use the spin lock everywhere where we touch the list. This also allows us to do rcu_rad_lock before we take the lock in rhashtable_walk_start. With the old mutex this would've deadlocked but it's safe with the new spin lock. Fixes: ba7c95ea3870 ("rhashtable: Fix sleeping inside RCU...") Reported-by: Colin Ian King Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index a98e71db7dd2..eb9240c458fa 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -518,10 +518,10 @@ int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) if (!iter->walker) return -ENOMEM; - mutex_lock(&ht->mutex); + spin_lock(&ht->lock); iter->walker->tbl = rht_dereference(ht->tbl, ht); list_add(&iter->walker->list, &iter->walker->tbl->walkers); - mutex_unlock(&ht->mutex); + spin_unlock(&ht->lock); return 0; } @@ -535,10 +535,10 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); */ void rhashtable_walk_exit(struct rhashtable_iter *iter) { - mutex_lock(&iter->ht->mutex); + spin_lock(&iter->ht->lock); if (iter->walker->tbl) list_del(&iter->walker->list); - mutex_unlock(&iter->ht->mutex); + spin_unlock(&iter->ht->lock); kfree(iter->walker); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); @@ -562,14 +562,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter) { struct rhashtable *ht = iter->ht; - mutex_lock(&ht->mutex); + rcu_read_lock(); + spin_lock(&ht->lock); if (iter->walker->tbl) list_del(&iter->walker->list); - - rcu_read_lock(); - - mutex_unlock(&ht->mutex); + spin_unlock(&ht->lock); if (!iter->walker->tbl) { iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); -- cgit v1.2.3