summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--kernel/bpf/hashtab.c65
1 files changed, 56 insertions, 9 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9a3b926e915c..53d9483fee10 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -46,10 +46,43 @@
* from one of these contexts completed. sys_bpf() uses the same mechanism
* by pinning the task to the current CPU and incrementing the recursion
* protection accross the map operation.
+ *
+ * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
+ * operations like memory allocations (even with GFP_ATOMIC) from atomic
+ * contexts. This is required because even with GFP_ATOMIC the memory
+ * allocator calls into code pathes which acquire locks with long held lock
+ * sections. To ensure the deterministic behaviour these locks are regular
+ * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
+ * true atomic contexts on an RT kernel are the low level hardware
+ * handling, scheduling, low level interrupt handling, NMIs etc. None of
+ * these contexts should ever do memory allocations.
+ *
+ * As regular device interrupt handlers and soft interrupts are forced into
+ * thread context, the existing code which does
+ * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*();
+ * just works.
+ *
+ * In theory the BPF locks could be converted to regular spinlocks as well,
+ * but the bucket locks and percpu_freelist locks can be taken from
+ * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
+ * atomic contexts even on RT. These mechanisms require preallocated maps,
+ * so there is no need to invoke memory allocations within the lock held
+ * sections.
+ *
+ * BPF maps which need dynamic allocation are only used from (forced)
+ * thread context on RT and can therefore use regular spinlocks which in
+ * turn allows to invoke memory allocations from the lock held section.
+ *
+ * On a non RT kernel this distinction is neither possible nor required.
+ * spinlock maps to raw_spinlock and the extra code is optimized out by the
+ * compiler.
*/
struct bucket {
struct hlist_nulls_head head;
- raw_spinlock_t lock;
+ union {
+ raw_spinlock_t raw_lock;
+ spinlock_t lock;
+ };
};
struct bpf_htab {
@@ -88,13 +121,26 @@ struct htab_elem {
char key[0] __aligned(8);
};
+static inline bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+ return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
+static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
+{
+ return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
+}
+
static void htab_init_buckets(struct bpf_htab *htab)
{
unsigned i;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- raw_spin_lock_init(&htab->buckets[i].lock);
+ if (htab_use_raw_lock(htab))
+ raw_spin_lock_init(&htab->buckets[i].raw_lock);
+ else
+ spin_lock_init(&htab->buckets[i].lock);
}
}
@@ -103,7 +149,10 @@ static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,
{
unsigned long flags;
- raw_spin_lock_irqsave(&b->lock, flags);
+ if (htab_use_raw_lock(htab))
+ raw_spin_lock_irqsave(&b->raw_lock, flags);
+ else
+ spin_lock_irqsave(&b->lock, flags);
return flags;
}
@@ -111,7 +160,10 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
struct bucket *b,
unsigned long flags)
{
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ if (htab_use_raw_lock(htab))
+ raw_spin_unlock_irqrestore(&b->raw_lock, flags);
+ else
+ spin_unlock_irqrestore(&b->lock, flags);
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -128,11 +180,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
-static bool htab_is_prealloc(const struct bpf_htab *htab)
-{
- return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
-}
-
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{