diff options
author | David S. Miller <davem@davemloft.net> | 2019-12-27 14:20:10 -0800 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2019-12-27 14:20:10 -0800 |
commit | 2bbc078f812d45b8decb55935dab21199bd21489 (patch) | |
tree | b217e030e0f80a26561cef679e8ae2643162b346 /kernel/bpf | |
parent | 9e41fbf3dd38327d440a8f3ba0c234519dbb5280 (diff) | |
parent | 7c8dce4b166113743adad131b5a24c4acc12f92c (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:
====================
pull-request: bpf-next 2019-12-27
The following pull-request contains BPF updates for your *net-next* tree.
We've added 127 non-merge commits during the last 17 day(s) which contain
a total of 110 files changed, 6901 insertions(+), 2721 deletions(-).
There are three merge conflicts. Conflicts and resolution looks as follows:
1) Merge conflict in net/bpf/test_run.c:
There was a tree-wide cleanup c593642c8be0 ("treewide: Use sizeof_field() macro")
which gets in the way with b590cb5f802d ("bpf: Switch to offsetofend in
BPF_PROG_TEST_RUN"):
<<<<<<< HEAD
if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) +
sizeof_field(struct __sk_buff, priority),
=======
if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority),
>>>>>>> 7c8dce4b166113743adad131b5a24c4acc12f92c
There are a few occasions that look similar to this. Always take the chunk with
offsetofend(). Note that there is one where the fields differ in here:
<<<<<<< HEAD
if (!range_is_zero(__skb, offsetof(struct __sk_buff, tstamp) +
sizeof_field(struct __sk_buff, tstamp),
=======
if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs),
>>>>>>> 7c8dce4b166113743adad131b5a24c4acc12f92c
Just take the one with offsetofend() /and/ gso_segs. Latter is correct due to
850a88cc4096 ("bpf: Expose __sk_buff wire_len/gso_segs to BPF_PROG_TEST_RUN").
2) Merge conflict in arch/riscv/net/bpf_jit_comp.c:
(I'm keeping Bjorn in Cc here for a double-check in case I got it wrong.)
<<<<<<< HEAD
if (is_13b_check(off, insn))
return -1;
emit(rv_blt(tcc, RV_REG_ZERO, off >> 1), ctx);
=======
emit_branch(BPF_JSLT, RV_REG_T1, RV_REG_ZERO, off, ctx);
>>>>>>> 7c8dce4b166113743adad131b5a24c4acc12f92c
Result should look like:
emit_branch(BPF_JSLT, tcc, RV_REG_ZERO, off, ctx);
3) Merge conflict in arch/riscv/include/asm/pgtable.h:
<<<<<<< HEAD
=======
#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
#define VMALLOC_END (PAGE_OFFSET - 1)
#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
#define BPF_JIT_REGION_SIZE (SZ_128M)
#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
#define BPF_JIT_REGION_END (VMALLOC_END)
/*
* Roughly size the vmemmap space to be large enough to fit enough
* struct pages to map half the virtual address space. Then
* position vmemmap directly below the VMALLOC region.
*/
#define VMEMMAP_SHIFT \
(CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
#define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT)
#define VMEMMAP_END (VMALLOC_START - 1)
#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE)
#define vmemmap ((struct page *)VMEMMAP_START)
>>>>>>> 7c8dce4b166113743adad131b5a24c4acc12f92c
Only take the BPF_* defines from there and move them higher up in the
same file. Remove the rest from the chunk. The VMALLOC_* etc defines
got moved via 01f52e16b868 ("riscv: define vmemmap before pfn_to_page
calls"). Result:
[...]
#define __S101 PAGE_READ_EXEC
#define __S110 PAGE_SHARED_EXEC
#define __S111 PAGE_SHARED_EXEC
#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
#define VMALLOC_END (PAGE_OFFSET - 1)
#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
#define BPF_JIT_REGION_SIZE (SZ_128M)
#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
#define BPF_JIT_REGION_END (VMALLOC_END)
/*
* Roughly size the vmemmap space to be large enough to fit enough
* struct pages to map half the virtual address space. Then
* position vmemmap directly below the VMALLOC region.
*/
#define VMEMMAP_SHIFT \
(CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
#define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT)
#define VMEMMAP_END (VMALLOC_START - 1)
#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE)
[...]
Let me know if there are any other issues.
Anyway, the main changes are:
1) Extend bpftool to produce a struct (aka "skeleton") tailored and specific
to a provided BPF object file. This provides an alternative, simplified API
compared to standard libbpf interaction. Also, add libbpf extern variable
resolution for .kconfig section to import Kconfig data, from Andrii Nakryiko.
2) Add BPF dispatcher for XDP which is a mechanism to avoid indirect calls by
generating a branch funnel as discussed back in bpfconf'19 at LSF/MM. Also,
add various BPF riscv JIT improvements, from Björn Töpel.
3) Extend bpftool to allow matching BPF programs and maps by name,
from Paul Chaignon.
4) Support for replacing cgroup BPF programs attached with BPF_F_ALLOW_MULTI
flag for allowing updates without service interruption, from Andrey Ignatov.
5) Cleanup and simplification of ring access functions for AF_XDP with a
bonus of 0-5% performance improvement, from Magnus Karlsson.
6) Enable BPF JITs for x86-64 and arm64 by default. Also, final version of
audit support for BPF, from Daniel Borkmann and latter with Jiri Olsa.
7) Move and extend test_select_reuseport into BPF program tests under
BPF selftests, from Jakub Sitnicki.
8) Various BPF sample improvements for xdpsock for customizing parameters
to set up and benchmark AF_XDP, from Jay Jayatheerthan.
9) Improve libbpf to provide a ulimit hint on permission denied errors.
Also change XDP sample programs to attach in driver mode by default,
from Toke Høiland-Jørgensen.
10) Extend BPF test infrastructure to allow changing skb mark from tc BPF
programs, from Nikita V. Shirokov.
11) Optimize prologue code sequence in BPF arm32 JIT, from Russell King.
12) Fix xdp_redirect_cpu BPF sample to manually attach to tracepoints after
libbpf conversion, from Jesper Dangaard Brouer.
13) Minor misc improvements from various others.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 1 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 97 | ||||
-rw-r--r-- | kernel/bpf/core.c | 6 | ||||
-rw-r--r-- | kernel/bpf/cpumap.c | 76 | ||||
-rw-r--r-- | kernel/bpf/devmap.c | 78 | ||||
-rw-r--r-- | kernel/bpf/dispatcher.c | 158 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 63 | ||||
-rw-r--r-- | kernel/bpf/trampoline.c | 24 | ||||
-rw-r--r-- | kernel/bpf/xskmap.c | 18 |
9 files changed, 329 insertions, 192 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3f671bf617e8..d4f330351f87 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4fb20ab179fe..ee871d4a51f7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -103,8 +103,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type, - u32 new_flags) + enum bpf_attach_type type) { struct cgroup *p; @@ -283,31 +282,34 @@ cleanup: * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * * Must be called with cgroup_mutex held. */ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_prog *replace_prog, enum bpf_attach_type type, u32 flags) { + u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; - struct bpf_prog_list *pl; - bool pl_was_allocated; int err; - if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || + ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type, flags)) + if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -317,6 +319,21 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + if (pl->prog == replace_prog) + replace_pl = pl; + } + if ((flags & BPF_F_REPLACE) && !replace_pl) + /* prog to replace not found for cgroup */ + return -ENOENT; + } else if (!list_empty(progs)) { + replace_pl = list_first_entry(progs, typeof(*pl), node); + } + for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storage[stype])) { @@ -327,53 +344,28 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } } - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) { - /* disallow attaching the same prog twice */ - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -EINVAL; - } + if (replace_pl) { + pl = replace_pl; + old_prog = pl->prog; + for_each_cgroup_storage_type(stype) { + old_storage[stype] = pl->storage[stype]; + bpf_cgroup_storage_unlink(old_storage[stype]); } - + } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } - - pl_was_allocated = true; - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; list_add_tail(&pl->node, progs); - } else { - if (list_empty(progs)) { - pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) { - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -ENOMEM; - } - pl_was_allocated = true; - list_add_tail(&pl->node, progs); - } else { - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - for_each_cgroup_storage_type(stype) { - old_storage[stype] = pl->storage[stype]; - bpf_cgroup_storage_unlink(old_storage[stype]); - } - pl_was_allocated = false; - } - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; } - cgrp->bpf.flags[type] = flags; + pl->prog = prog; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; + + cgrp->bpf.flags[type] = saved_flags; err = update_effective_progs(cgrp, type); if (err) @@ -401,7 +393,7 @@ cleanup: pl->storage[stype] = old_storage[stype]; bpf_cgroup_storage_link(old_storage[stype], cgrp, type); } - if (pl_was_allocated) { + if (!replace_pl) { list_del(&pl->node); kfree(pl); } @@ -539,6 +531,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { + struct bpf_prog *replace_prog = NULL; struct cgroup *cgrp; int ret; @@ -546,8 +539,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + if ((attr->attach_flags & BPF_F_ALLOW_MULTI) && + (attr->attach_flags & BPF_F_REPLACE)) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype); + if (IS_ERR(replace_prog)) { + cgroup_put(cgrp); + return PTR_ERR(replace_prog); + } + } + + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, attr->attach_flags); + + if (replace_prog) + bpf_prog_put(replace_prog); cgroup_put(cgrp); return ret; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index af6b738cf435..29d47aae0dd1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, u32 pages, delta; int ret; - BUG_ON(fp_old == NULL); - size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) @@ -520,9 +518,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ -int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); +int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; -int bpf_jit_kallsyms __read_mostly; long bpf_jit_limit __read_mostly; static __always_inline void diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ef49e17ae47c..70f71b154fa5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -72,17 +72,18 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); +static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); + +static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; - int ret, cpu; u64 cost; + int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - cmap->flush_list = alloc_percpu(struct list_head); - if (!cmap->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_percpu; + goto free_charge; return &cmap->map; -free_percpu: - free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -399,22 +390,14 @@ free_rcu: static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; - int cpu; /* This cpu_map_entry have been disconnected from map and one - * RCU graze-period have elapsed. Thus, XDP cannot queue any + * RCU grace-period have elapsed. Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); - /* Flush remaining packets in percpu bulkq */ - for_each_online_cpu(cpu) { - struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); - - /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(bq, false); - } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ put_cpu_map_entry(rcpu); @@ -436,7 +419,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) * percpu bulkq to queue. Due to caller map_delete_elem() disable * preemption, cannot call kthread_stop() to make sure queue is empty. * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU graze period before + * cpu_map_kthread_stop, which waits for an RCU grace period before * stopping kthread, emptying the queue. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, @@ -507,7 +490,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - int cpu; u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, @@ -522,18 +504,6 @@ static void cpu_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); - /* To ensure all pending flush operations have completed wait for flush - * list be empty on _all_ cpus. Because the above synchronize_rcu() - * ensures the map is disconnected from the program we can assume no new - * items will be added to the list. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - /* For cpu_map the remote CPUs can still be using the entries * (struct bpf_cpu_map_entry). */ @@ -544,10 +514,9 @@ static void cpu_map_free(struct bpf_map *map) if (!rcpu) continue; - /* bq flush and cleanup happens after RCU graze-period */ + /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -599,7 +568,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; @@ -620,10 +589,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } @@ -642,11 +608,11 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -681,16 +647,26 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_flush(struct bpf_map *map) +void __cpu_map_flush(void) { - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); } } + +static int __init cpu_map_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); + return 0; +} + +subsys_initcall(cpu_map_init); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3d3d61b5985b..da9c832fc5c8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,7 +75,6 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ - struct list_head __percpu *flush_list; struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -85,6 +84,7 @@ struct bpf_dtab { u32 n_buckets; }; +static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -109,8 +109,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - int err, cpu; - u64 cost; + u64 cost = 0; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -125,9 +125,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); - /* make sure page count doesn't overflow */ - cost = (u64) sizeof(struct list_head) * num_possible_cpus(); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -143,17 +140,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (err) return -EINVAL; - dtab->flush_list = alloc_percpu(struct list_head); - if (!dtab->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_percpu; + goto free_charge; spin_lock_init(&dtab->index_lock); } else { @@ -161,13 +151,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_percpu; + goto free_charge; } return 0; -free_percpu: - free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); return -ENOMEM; @@ -201,7 +189,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i, cpu; + int i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -221,18 +209,6 @@ static void dev_map_free(struct bpf_map *map) /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); - /* To ensure all pending flush operations have completed wait for flush - * list to empty on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new items will be added. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; @@ -266,7 +242,6 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - free_percpu(dtab->flush_list); kfree(dtab); } @@ -345,8 +320,7 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, - bool in_napi_ctx) +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) { struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; @@ -384,11 +358,7 @@ error: for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; - /* RX path under NAPI protection, can return frames faster */ - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); drops++; } goto out; @@ -401,15 +371,14 @@ error: * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(struct bpf_map *map) +void __dev_map_flush(void) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) - bq_xmit_all(bq, XDP_XMIT_FLUSH, true); + bq_xmit_all(bq, XDP_XMIT_FLUSH); rcu_read_unlock(); } @@ -436,11 +405,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(bq, 0, true); + bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -509,27 +478,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } -static void dev_map_flush_old(struct bpf_dtab_netdev *dev) -{ - if (dev->dev->netdev_ops->ndo_xdp_xmit) { - struct xdp_bulk_queue *bq; - int cpu; - - rcu_read_lock(); - for_each_online_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(bq, XDP_XMIT_FLUSH, false); - } - rcu_read_unlock(); - } -} - static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - dev_map_flush_old(dev); free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); @@ -810,10 +763,15 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + int cpu; + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); return 0; } diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c new file mode 100644 index 000000000000..204ee61a3904 --- /dev/null +++ b/kernel/bpf/dispatcher.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2019 Intel Corporation. */ + +#include <linux/hash.h> +#include <linux/bpf.h> +#include <linux/filter.h> + +/* The BPF dispatcher is a multiway branch code generator. The + * dispatcher is a mechanism to avoid the performance penalty of an + * indirect call, which is expensive when retpolines are enabled. A + * dispatch client registers a BPF program into the dispatcher, and if + * there is available room in the dispatcher a direct call to the BPF + * program will be generated. All calls to the BPF programs called via + * the dispatcher will then be a direct call, instead of an + * indirect. The dispatcher hijacks a trampoline function it via the + * __fentry__ of the trampoline. The trampoline function has the + * following signature: + * + * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi, + * unsigned int (*bpf_func)(const void *, + * const struct bpf_insn *)); + */ + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog( + struct bpf_dispatcher *d, struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (prog == d->progs[i].prog) + return &d->progs[i]; + } + return NULL; +} + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_free( + struct bpf_dispatcher *d) +{ + return bpf_dispatcher_find_prog(d, NULL); +} + +static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (entry) { + refcount_inc(&entry->users); + return false; + } + + entry = bpf_dispatcher_find_free(d); + if (!entry) + return false; + + bpf_prog_inc(prog); + entry->prog = prog; + refcount_set(&entry->users, 1); + d->num_progs++; + return true; +} + +static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (!entry) + return false; + + if (refcount_dec_and_test(&entry->users)) { + entry->prog = NULL; + bpf_prog_put(prog); + d->num_progs--; + return true; + } + return false; +} + +int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +{ + return -ENOTSUPP; +} + +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +{ + s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (d->progs[i].prog) + *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; + } + return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); +} + +static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) +{ + void *old, *new; + u32 noff; + int err; + + if (!prev_num_progs) { + old = NULL; + noff = 0; + } else { + old = d->image + d->image_off; + noff = d->image_off ^ (PAGE_SIZE / 2); + } + + new = d->num_progs ? d->image + noff : NULL; + if (new) { + if (bpf_dispatcher_prepare(d, new)) + return; + } + + err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new); + if (err || !new) + return; + + d->image_off = noff; +} + +void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, + struct bpf_prog *to) +{ + bool changed = false; + int prev_num_progs; + + if (from == to) + return; + + mutex_lock(&d->mutex); + if (!d->image) { + d->image = bpf_jit_alloc_exec_page(); + if (!d->image) + goto out; + } + + prev_num_progs = d->num_progs; + changed |= bpf_dispatcher_remove_prog(d, from); + changed |= bpf_dispatcher_add_prog(d, to); + + if (!changed) + goto out; + + bpf_dispatcher_update(d, prev_num_progs); +out: + mutex_unlock(&d->mutex); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3461ec59570..81ee8595dfee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include <linux/timekeeping.h> #include <linux/ctype.h> #include <linux/nospec.h> +#include <linux/audit.h> #include <uapi/linux/btf.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -1306,6 +1307,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } +enum bpf_audit { + BPF_AUDIT_LOAD, + BPF_AUDIT_UNLOAD, + BPF_AUDIT_MAX, +}; + +static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { + [BPF_AUDIT_LOAD] = "LOAD", + [BPF_AUDIT_UNLOAD] = "UNLOAD", +}; + +static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) +{ + struct audit_context *ctx = NULL; + struct audit_buffer *ab; + + if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) + return; + if (audit_enabled == AUDIT_OFF) + return; + if (op == BPF_AUDIT_LOAD) + ctx = audit_context(); + ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); + if (unlikely(!ab)) + return; + audit_log_format(ab, "prog-id=%u op=%s", + prog->aux->id, bpf_audit_str[op]); + audit_log_end(ab); +} + int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1421,6 +1452,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1830,6 +1862,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) @@ -2040,10 +2073,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd #define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) static int bpf_prog_attach(const union bpf_attr *attr) { @@ -2305,17 +2338,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id -static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +struct bpf_prog *bpf_prog_by_id(u32 id) { struct bpf_prog *prog; - u32 id = attr->prog_id; - int fd; - - if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!id) + return ERR_PTR(-ENOENT); spin_lock_bh(&prog_idr_lock); prog = idr_find(&prog_idr, id); @@ -2324,7 +2352,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) else prog = ERR_PTR(-ENOENT); spin_unlock_bh(&prog_idr_lock); + return prog; +} + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + prog = bpf_prog_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 23b0d5cfd47e..505f4e4b31d2 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -14,6 +14,22 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +void *bpf_jit_alloc_exec_page(void) +{ + void *image; + + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) + return NULL; + + set_vm_flush_reset_perms(image); + /* Keep image as writeable. The alternative is to keep flipping ro/rw + * everytime new program is attached or detached. + */ + set_memory_x((long)image, 1); + return image; +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -34,7 +50,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec(PAGE_SIZE); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -48,12 +64,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); - - set_vm_flush_reset_perms(image); - /* Keep image as writeable. The alternative is to keep flipping ro/rw - * everytime new program is attached or detached. - */ - set_memory_x((long)image, 1); tr->image = image; out: mutex_unlock(&trampoline_mutex); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 90c4fce1c981..2cc5c8f4c800 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct bpf_map_memory mem; - int cpu, err, numa_node; + int err, numa_node; struct xsk_map *m; - u64 cost, size; + u64 size; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus()); - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, size); if (err < 0) return ERR_PTR(err); @@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); - m->flush_list = alloc_percpu(struct list_head); - if (!m->flush_list) { - bpf_map_charge_finish(&m->map.memory); - bpf_map_area_free(m); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); - return &m->map; } @@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_net(); - free_percpu(m->flush_list); bpf_map_area_free(m); } |