diff options
author | Daniel Borkmann <daniel@iogearbox.net> | 2018-05-16 22:02:14 +0200 |
---|---|---|
committer | Daniel Borkmann <daniel@iogearbox.net> | 2018-05-16 22:02:23 +0200 |
commit | 5b26ace65012cbf3b54a68c39041bcd8ae5e31a7 (patch) | |
tree | cc46bf2ebf4d8c1c2fef46affd67d622b405dedc | |
parent | f2467c2dbc019548052f3a64dc1efd01c0ae27aa (diff) | |
parent | 62c52d1fddb5ef201e3a25d7bd1b79fcb0ca42b8 (diff) |
Merge branch 'bpf-sock-hashmap'
John Fastabend says:
====================
In the original sockmap implementation we got away with using an
array similar to devmap. However, unlike devmap where an ifindex
has a nice 1:1 function into the map we have found some use cases
with sockets that need to be referenced using longer keys.
This series adds support for a sockhash map reusing as much of
the sockmap code as possible. I made the decision to add sockhash
specific helpers vs trying to generalize the existing helpers
because (a) they have sockmap in the name and (b) the keys are
different types. I prefer to be explicit here rather than play
type games or do something else tricky.
To test this we duplicate all the sockmap testing except swap out
the sockmap with a sockhash.
v2: fix file stats and add v2 tag
v3: move tool updates into test patch, move bpftool updates into
its own patch, and fixup the test patch stats to catch the
renamed file and provide only diffs ± on that.
v4: Add documentation to UAPI bpf.h
v5: Add documentation to tools UAPI bpf.h
v6: 'git add' test_sockhash_kern.c which was previously missing
but was not causing issues because of typo in test script,
noticed by Daniel. After this the git format-patch -M option
no longer tracks the rename of the test_sockmap_kern files for
some reason. I guess the diff has exceeded some threshold.
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
-rw-r--r-- | include/linux/bpf.h | 8 | ||||
-rw-r--r-- | include/linux/bpf_types.h | 1 | ||||
-rw-r--r-- | include/linux/filter.h | 3 | ||||
-rw-r--r-- | include/net/tcp.h | 3 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 54 | ||||
-rw-r--r-- | kernel/bpf/core.c | 1 | ||||
-rw-r--r-- | kernel/bpf/sockmap.c | 638 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 14 | ||||
-rw-r--r-- | net/core/filter.c | 89 | ||||
-rw-r--r-- | tools/bpf/bpftool/map.c | 1 | ||||
-rw-r--r-- | tools/include/uapi/linux/bpf.h | 54 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/Makefile | 2 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/bpf_helpers.h | 8 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/test_sockhash_kern.c | 5 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/test_sockmap.c | 27 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/test_sockmap_kern.c | 343 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/test_sockmap_kern.h | 363 |
17 files changed, 1161 insertions, 453 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a38e474bf7ee..ed0122b45b63 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) @@ -675,6 +676,12 @@ static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } +static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map, + void *key) +{ + return NULL; +} + static inline int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) @@ -724,6 +731,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; +extern const struct bpf_func_proto bpf_sock_hash_update_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d7df1b323082..b67f8793de0d 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -47,6 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/linux/filter.h b/include/linux/filter.h index da7e16523128..9dbcb9d55921 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -515,9 +515,8 @@ struct sk_msg_buff { int sg_end; struct scatterlist sg_data[MAX_SKB_FRAGS]; bool sg_copy[MAX_SKB_FRAGS]; - __u32 key; __u32 flags; - struct bpf_map *map; + struct sock *sk_redir; struct sk_buff *skb; struct list_head list; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index cf803fe0fb86..059287374ba0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -814,9 +814,8 @@ struct tcp_skb_cb { #endif } header; /* For incoming skbs */ struct { - __u32 key; __u32 flags; - struct bpf_map *map; + struct sock *sk_redir; void *data_end; } bpf; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 02e4112510f8..d94d333a8225 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, }; enum bpf_prog_type { @@ -1828,7 +1829,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. @@ -1855,6 +1855,53 @@ union bpf_attr { * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case * of failure. + * + * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdeict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1926,7 +1973,10 @@ union bpf_attr { FN(skb_get_xfrm_state), \ FN(get_stack), \ FN(skb_load_bytes_relative), \ - FN(fib_lookup), + FN(fib_lookup), \ + FN(sock_hash_update), \ + FN(msg_redirect_hash), \ + FN(sk_redirect_hash), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d0d7d9462368..2194c6a9df42 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1707,6 +1707,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 098eca568c2b..56879c9fd3a4 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -48,14 +48,40 @@ #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; +struct bpf_sock_progs { struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; +struct bpf_stab { + struct bpf_map map; + struct sock **sock_map; + struct bpf_sock_progs progs; +}; + +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_htab { + struct bpf_map map; + struct bucket *buckets; + atomic_t count; + u32 n_buckets; + u32 elem_size; + struct bpf_sock_progs progs; +}; + +struct htab_elem { + struct rcu_head rcu; + struct hlist_node hash_node; + u32 hash; + struct sock *sk; + char key[0]; +}; + enum smap_psock_state { SMAP_TX_RUNNING, }; @@ -63,6 +89,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; + struct htab_elem *hash_link; + struct bpf_htab *htab; }; struct smap_psock { @@ -191,6 +219,12 @@ out: rcu_read_unlock(); } +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + atomic_dec(&htab->count); + kfree_rcu(l, rcu); +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -227,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } list_for_each_entry_safe(e, tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); + if (e->entry) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } else { + hlist_del_rcu(&e->hash_link->hash_node); + smap_release_sock(psock, e->hash_link->sk); + free_htab_elem(e->htab, e->hash_link); } } write_unlock_bh(&sk->sk_callback_lock); @@ -461,7 +501,7 @@ static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) { return ((_rc == SK_PASS) ? - (md->map ? __SK_REDIRECT : __SK_PASS) : + (md->sk_redir ? __SK_REDIRECT : __SK_PASS) : __SK_DROP); } @@ -1092,7 +1132,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) * when we orphan the skb so that we don't have the possibility * to reference a stale map. */ - TCP_SKB_CB(skb)->bpf.map = NULL; + TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; bpf_compute_data_pointers(skb); preempt_disable(); @@ -1102,7 +1142,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) /* Moving return codes from UAPI namespace into internal namespace */ return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) : + (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) : __SK_DROP; } @@ -1372,7 +1412,6 @@ static int smap_init_sock(struct smap_psock *psock, } static void smap_init_progs(struct smap_psock *psock, - struct bpf_stab *stab, struct bpf_prog *verdict, struct bpf_prog *parse) { @@ -1450,14 +1489,13 @@ static void smap_gc_work(struct work_struct *w) kfree(psock); } -static struct smap_psock *smap_init_psock(struct sock *sock, - struct bpf_stab *stab) +static struct smap_psock *smap_init_psock(struct sock *sock, int node) { struct smap_psock *psock; psock = kzalloc_node(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN, - stab->map.numa_node); + node); if (!psock) return ERR_PTR(-ENOMEM); @@ -1525,12 +1563,14 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, struct sock **entry) +static void smap_list_remove(struct smap_psock *psock, + struct sock **entry, + struct htab_elem *hash_link) { struct smap_psock_map_entry *e, *tmp; list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { + if (e->entry == entry || e->hash_link == hash_link) { list_del(&e->list); break; } @@ -1568,7 +1608,7 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i]); + smap_list_remove(psock, &stab->sock_map[i], NULL); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -1627,7 +1667,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k]); + smap_list_remove(psock, &stab->sock_map[k], NULL); smap_release_sock(psock, sock); out: write_unlock_bh(&sock->sk_callback_lock); @@ -1662,40 +1702,26 @@ out: * - sock_map must use READ_ONCE and (cmp)xchg operations * - BPF verdict/parse programs must use READ_ONCE and xchg operations */ -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 flags) + +static int __sock_map_ctx_update_elem(struct bpf_map *map, + struct bpf_sock_progs *progs, + struct sock *sock, + struct sock **map_link, + void *key) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct smap_psock_map_entry *e = NULL; struct bpf_prog *verdict, *parse, *tx_msg; - struct sock *osock, *sock; + struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; - u32 i = *(u32 *)key; bool new = false; int err; - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - - if (unlikely(i >= stab->map.max_entries)) - return -E2BIG; - - sock = READ_ONCE(stab->sock_map[i]); - if (flags == BPF_EXIST && !sock) - return -ENOENT; - else if (flags == BPF_NOEXIST && sock) - return -EEXIST; - - sock = skops->sk; - /* 1. If sock map has BPF programs those will be inherited by the * sock being added. If the sock is already attached to BPF programs * this results in an error. */ - verdict = READ_ONCE(stab->bpf_verdict); - parse = READ_ONCE(stab->bpf_parse); - tx_msg = READ_ONCE(stab->bpf_tx_msg); + verdict = READ_ONCE(progs->bpf_verdict); + parse = READ_ONCE(progs->bpf_parse); + tx_msg = READ_ONCE(progs->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -1703,11 +1729,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, * we increment the refcnt. If this is the case abort with an * error. */ - verdict = bpf_prog_inc_not_zero(stab->bpf_verdict); + verdict = bpf_prog_inc_not_zero(progs->bpf_verdict); if (IS_ERR(verdict)) return PTR_ERR(verdict); - parse = bpf_prog_inc_not_zero(stab->bpf_parse); + parse = bpf_prog_inc_not_zero(progs->bpf_parse); if (IS_ERR(parse)) { bpf_prog_put(verdict); return PTR_ERR(parse); @@ -1715,7 +1741,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + tx_msg = bpf_prog_inc_not_zero(progs->bpf_tx_msg); if (IS_ERR(tx_msg)) { if (verdict) bpf_prog_put(verdict); @@ -1748,7 +1774,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } } else { - psock = smap_init_psock(sock, stab); + psock = smap_init_psock(sock, map->numa_node); if (IS_ERR(psock)) { err = PTR_ERR(psock); goto out_progs; @@ -1758,12 +1784,13 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, new = true; } - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_progs; + if (map_link) { + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) { + err = -ENOMEM; + goto out_progs; + } } - e->entry = &stab->sock_map[i]; /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. @@ -1780,7 +1807,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = smap_init_sock(psock, sock); if (err) goto out_free; - smap_init_progs(psock, stab, verdict, parse); + smap_init_progs(psock, verdict, parse); smap_start_sock(psock, sock); } @@ -1789,20 +1816,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, * it with. Because we can only have a single set of programs if * old_sock has a strp we can stop it. */ - list_add_tail(&e->list, &psock->maps); - write_unlock_bh(&sock->sk_callback_lock); - - osock = xchg(&stab->sock_map[i], sock); - if (osock) { - struct smap_psock *opsock = smap_psock_sk(osock); - - write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i]); - smap_release_sock(opsock, osock); - write_unlock_bh(&osock->sk_callback_lock); + if (map_link) { + e->entry = map_link; + list_add_tail(&e->list, &psock->maps); } - return 0; + write_unlock_bh(&sock->sk_callback_lock); + return err; out_free: + kfree(e); smap_release_sock(psock, sock); out_progs: if (verdict) @@ -1816,23 +1837,73 @@ out_progs: return err; } -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_sock_progs *progs = &stab->progs; + struct sock *osock, *sock; + u32 i = *(u32 *)key; + int err; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= stab->map.max_entries)) + return -E2BIG; + + sock = READ_ONCE(stab->sock_map[i]); + if (flags == BPF_EXIST && !sock) + return -ENOENT; + else if (flags == BPF_NOEXIST && sock) + return -EEXIST; + + sock = skops->sk; + err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i], + key); + if (err) + goto out; + + osock = xchg(&stab->sock_map[i], sock); + if (osock) { + struct smap_psock *opsock = smap_psock_sk(osock); + + write_lock_bh(&osock->sk_callback_lock); + smap_list_remove(opsock, &stab->sock_map[i], NULL); + smap_release_sock(opsock, osock); + write_unlock_bh(&osock->sk_callback_lock); + } +out: + return 0; +} + +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +{ + struct bpf_sock_progs *progs; struct bpf_prog *orig; - if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP)) + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; + } else { return -EINVAL; + } switch (type) { case BPF_SK_MSG_VERDICT: - orig = xchg(&stab->bpf_tx_msg, prog); + orig = xchg(&progs->bpf_tx_msg, prog); break; case BPF_SK_SKB_STREAM_PARSER: - orig = xchg(&stab->bpf_parse, prog); + orig = xchg(&progs->bpf_parse, prog); break; case BPF_SK_SKB_STREAM_VERDICT: - orig = xchg(&stab->bpf_verdict, prog); + orig = xchg(&progs->bpf_verdict, prog); break; default: return -EOPNOTSUPP; @@ -1880,21 +1951,415 @@ static int sock_map_update_elem(struct bpf_map *map, static void sock_map_release(struct bpf_map *map) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_sock_progs *progs; struct bpf_prog *orig; - orig = xchg(&stab->bpf_parse, NULL); + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; + } + + orig = xchg(&progs->bpf_parse, NULL); if (orig) bpf_prog_put(orig); - orig = xchg(&stab->bpf_verdict, NULL); + orig = xchg(&progs->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); - orig = xchg(&stab->bpf_tx_msg, NULL); + orig = xchg(&progs->bpf_tx_msg, NULL); if (orig) bpf_prog_put(orig); } +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8); + err = -EINVAL; + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct bucket)) + goto free_htab; + + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) + goto free_htab; + + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + + err = -ENOMEM; + htab->buckets = bpf_map_area_alloc( + htab->n_buckets * sizeof(struct bucket), + htab->map.numa_node); + if (!htab->buckets) + goto free_htab; + + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; + + synchronize_rcu(); + + /* At this point no update, lookup or delete operations can happen. + * However, be aware we can still get a socket state event updates, + * and data ready callabacks that reference the psock from sk_user_data + * Also psock worker threads are still in-flight. So smap_release_sock + * will only free the psock after cancel_sync on the worker threads + * and a grace period expire to ensure psock is really safe to remove. + */ + rcu_read_lock(); + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get + * the sk_callback_lock before this case but after xchg + * causing the refcnt to hit zero and sock user data + * (psock) to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + kfree(l); + } + } + rcu_read_unlock(); + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, + void *key, u32 key_size, u32 hash, + struct sock *sk, + struct htab_elem *old_elem) +{ + struct htab_elem *l_new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old_elem) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!l_new) + return ERR_PTR(-ENOMEM); + + memcpy(l_new->key, key, key_size); + l_new->sk = sk; + l_new->hash = hash; + return l_new; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static int sock_hash_get_next_key(struct bpf_map *map, + void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l, *next_l; + struct hlist_head *h; + u32 hash, key_size; + int i = 0; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + if (!key) + goto find_first_elem; + hash = htab_map_hash(key, key_size); + h = select_bucket(htab, hash); + + l = lookup_elem_raw(h, hash, key, key_size); + if (!l) + goto find_first_elem; + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + if (next_l) { + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + h = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(h)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* iterated over all buckets and all elements */ + return -ENOENT; +} + +static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_sock_progs *progs = &htab->progs; + struct htab_elem *l_new = NULL, *l_old; + struct smap_psock_map_entry *e = NULL; + struct hlist_head *head; + struct smap_psock *psock; + u32 key_size, hash; + struct sock *sock; + struct bucket *b; + int err; + + sock = skops->sk; + + if (sock->sk_type != SOCK_STREAM || + sock->sk_protocol != IPPROTO_TCP) + return -EOPNOTSUPP; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; + + WARN_ON_ONCE(!rcu_read_lock_held()); + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + if (err) + goto err; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_bh(&b->lock); + l_old = lookup_elem_raw(head, hash, key, key_size); + if (l_old && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto bucket_err; + } + if (!l_old && map_flags == BPF_EXIST) { + err = -ENOENT; + goto bucket_err; + } + + l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); + if (IS_ERR(l_new)) { + err = PTR_ERR(l_new); + goto bucket_err; + } + + psock = smap_psock_sk(sock); + if (unlikely(!psock)) { + err = -EINVAL; + goto bucket_err; + } + + e->hash_link = l_new; + e->htab = container_of(map, struct bpf_htab, map); + list_add_tail(&e->list, &psock->maps); + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + psock = smap_psock_sk(l_old->sk); + + hlist_del_rcu(&l_old->hash_node); + smap_list_remove(psock, NULL, l_old); + smap_release_sock(psock, l_old->sk); + free_htab_elem(htab, l_old); + } + raw_spin_unlock_bh(&b->lock); + return 0; +bucket_err: + raw_spin_unlock_bh(&b->lock); +err: + kfree(e); + psock = smap_psock_sk(sock); + if (psock) + smap_release_sock(psock, sock); + return err; +} + +static int sock_hash_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + err = sock_hash_ctx_update_elem(&skops, map, key, flags); + fput(socket->file); + return err; +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + u32 hash, key_size; + int ret = -ENOENT; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + if (l) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + free_htab_elem(htab, l); + ret = 0; + } + raw_spin_unlock_bh(&b->lock); + return ret; +} + +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 key_size, hash; + struct bucket *b; + struct sock *sk; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + sk = l ? l->sk : NULL; + raw_spin_unlock_bh(&b->lock); + return sk; +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -1905,6 +2370,15 @@ const struct bpf_map_ops sock_map_ops = { .map_release_uref = sock_map_release, }; +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, +}; + BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { @@ -1922,3 +2396,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d92d9c37affd..a9e4b1372da6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2093,6 +2093,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_map) goto error; break; + case BPF_MAP_TYPE_SOCKHASH: + if (func_id != BPF_FUNC_sk_redirect_hash && + func_id != BPF_FUNC_sock_hash_update && + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_hash) + goto error; + break; default: break; } @@ -2130,11 +2137,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_sk_redirect_map: case BPF_FUNC_msg_redirect_map: + case BPF_FUNC_sock_map_update: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; - case BPF_FUNC_sock_map_update: - if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + case BPF_FUNC_sk_redirect_hash: + case BPF_FUNC_msg_redirect_hash: + case BPF_FUNC_sock_hash_update: + if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; default: diff --git a/net/core/filter.c b/net/core/filter.c index ca60d2872da4..6d0d1560bd70 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2074,6 +2074,33 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { @@ -2083,9 +2110,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - tcb->bpf.key = key; tcb->bpf.flags = flags; - tcb->bpf.map = map; + tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; return SK_PASS; } @@ -2093,16 +2121,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct sock *do_sk_redirect_map(struct sk_buff *skb) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - struct sock *sk = NULL; - - if (tcb->bpf.map) { - sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key); - - tcb->bpf.key = 0; - tcb->bpf.map = NULL; - } - return sk; + return tcb->bpf.sk_redir; } static const struct bpf_func_proto bpf_sk_redirect_map_proto = { @@ -2115,32 +2135,49 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, - struct bpf_map *, map, u32, key, u64, flags) +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, + struct bpf_map *, map, void *, key, u64, flags) { /* If user passes invalid input drop the packet. */ if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - msg->key = key; msg->flags = flags; - msg->map = map; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; return SK_PASS; } -struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) { - struct sock *sk = NULL; + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; - if (msg->map) { - sk = __sock_map_lookup_elem(msg->map, msg->key); + msg->flags = flags; + msg->sk_redir = __sock_map_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; - msg->key = 0; - msg->map = NULL; - } + return SK_PASS; +} - return sk; +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + return msg->sk_redir; } static const struct bpf_func_proto bpf_msg_redirect_map_proto = { @@ -4517,6 +4554,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; + case BPF_FUNC_sock_hash_update: + return &bpf_sock_hash_update_proto; default: return bpf_base_func_proto(func_id); } @@ -4528,6 +4567,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_redirect_hash: + return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: @@ -4559,6 +4600,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; + case BPF_FUNC_sk_redirect_hash: + return &bpf_sk_redirect_hash_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index af6766e956ba..097b1a5e046b 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -66,6 +66,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_DEVMAP] = "devmap", [BPF_MAP_TYPE_SOCKMAP] = "sockmap", [BPF_MAP_TYPE_CPUMAP] = "cpumap", + [BPF_MAP_TYPE_SOCKHASH] = "sockhash", }; static bool map_is_per_cpu(__u32 type) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 02e4112510f8..d94d333a8225 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, }; enum bpf_prog_type { @@ -1828,7 +1829,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. @@ -1855,6 +1855,53 @@ union bpf_attr { * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case * of failure. + * + * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdeict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1926,7 +1973,10 @@ union bpf_attr { FN(skb_get_xfrm_state), \ FN(get_stack), \ FN(skb_load_bytes_relative), \ - FN(fib_lookup), + FN(fib_lookup), \ + FN(sock_hash_update), \ + FN(msg_redirect_hash), \ + FN(sk_redirect_hash), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 133ebc68cbe4..1eb0fa2aba92 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -33,7 +33,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \ sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \ test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ - test_get_stack_rawtp.o + test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 2375d06c706b..8f143dfb3700 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -75,9 +75,14 @@ static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) = (void *) BPF_FUNC_sock_ops_cb_flags_set; static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; +static int (*bpf_sk_redirect_hash)(void *ctx, void *map, void *key, int flags) = + (void *) BPF_FUNC_sk_redirect_hash; static int (*bpf_sock_map_update)(void *map, void *key, void *value, unsigned long long flags) = (void *) BPF_FUNC_sock_map_update; +static int (*bpf_sock_hash_update)(void *map, void *key, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_sock_hash_update; static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, void *buf, unsigned int buf_size) = (void *) BPF_FUNC_perf_event_read_value; @@ -88,6 +93,9 @@ static int (*bpf_override_return)(void *ctx, unsigned long rc) = (void *) BPF_FUNC_override_return; static int (*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) = (void *) BPF_FUNC_msg_redirect_map; +static int (*bpf_msg_redirect_hash)(void *ctx, + void *map, void *key, int flags) = + (void *) BPF_FUNC_msg_redirect_hash; static int (*bpf_msg_apply_bytes)(void *ctx, int len) = (void *) BPF_FUNC_msg_apply_bytes; static int (*bpf_msg_cork_bytes)(void *ctx, int len) = diff --git a/tools/testing/selftests/bpf/test_sockhash_kern.c b/tools/testing/selftests/bpf/test_sockhash_kern.c new file mode 100644 index 000000000000..e6755916442a --- /dev/null +++ b/tools/testing/selftests/bpf/test_sockhash_kern.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io +#undef SOCKMAP +#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKHASH +#include "./test_sockmap_kern.h" diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 29c022d23f4e..eb17fae458e6 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -47,7 +47,8 @@ static void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -#define BPF_FILENAME "test_sockmap_kern.o" +#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" +#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o" #define CG_PATH "/sockmap" /* global sockets */ @@ -1260,9 +1261,8 @@ int prog_type[] = { BPF_PROG_TYPE_SK_MSG, }; -static int populate_progs(void) +static int populate_progs(char *bpf_file) { - char *bpf_file = BPF_FILENAME; struct bpf_program *prog; struct bpf_object *obj; int i = 0; @@ -1306,11 +1306,11 @@ static int populate_progs(void) return 0; } -static int test_suite(void) +static int __test_suite(char *bpf_file) { int cg_fd, err; - err = populate_progs(); + err = populate_progs(bpf_file); if (err < 0) { fprintf(stderr, "ERROR: (%i) load bpf failed\n", err); return err; @@ -1347,17 +1347,30 @@ static int test_suite(void) out: printf("Summary: %i PASSED %i FAILED\n", passed, failed); + cleanup_cgroup_environment(); close(cg_fd); return err; } +static int test_suite(void) +{ + int err; + + err = __test_suite(BPF_SOCKMAP_FILENAME); + if (err) + goto out; + err = __test_suite(BPF_SOCKHASH_FILENAME); +out: + return err; +} + int main(int argc, char **argv) { struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; int iov_count = 1, length = 1024, rate = 1; struct sockmap_options options = {0}; int opt, longindex, err, cg_fd = 0; - char *bpf_file = BPF_FILENAME; + char *bpf_file = BPF_SOCKMAP_FILENAME; int test = PING_PONG; if (setrlimit(RLIMIT_MEMLOCK, &r)) { @@ -1438,7 +1451,7 @@ int main(int argc, char **argv) return -1; } - err = populate_progs(); + err = populate_progs(bpf_file); if (err) { fprintf(stderr, "populate program: (%s) %s\n", bpf_file, strerror(errno)); diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.c b/tools/testing/selftests/bpf/test_sockmap_kern.c index 33de97e2b6b6..677b2ed1cc1e 100644 --- a/tools/testing/selftests/bpf/test_sockmap_kern.c +++ b/tools/testing/selftests/bpf/test_sockmap_kern.c @@ -1,340 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io -#include <stddef.h> -#include <string.h> -#include <linux/bpf.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/in.h> -#include <linux/udp.h> -#include <linux/tcp.h> -#include <linux/pkt_cls.h> -#include <sys/socket.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" - -/* Sockmap sample program connects a client and a backend together - * using cgroups. - * - * client:X <---> frontend:80 client:X <---> backend:80 - * - * For simplicity we hard code values here and bind 1:1. The hard - * coded values are part of the setup in sockmap.sh script that - * is associated with this BPF program. - * - * The bpf_printk is verbose and prints information as connections - * are established and verdicts are decided. - */ - -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - -struct bpf_map_def SEC("maps") sock_map = { - .type = BPF_MAP_TYPE_SOCKMAP, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 20, -}; - -struct bpf_map_def SEC("maps") sock_map_txmsg = { - .type = BPF_MAP_TYPE_SOCKMAP, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 20, -}; - -struct bpf_map_def SEC("maps") sock_map_redir = { - .type = BPF_MAP_TYPE_SOCKMAP, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 20, -}; - -struct bpf_map_def SEC("maps") sock_apply_bytes = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 1 -}; - -struct bpf_map_def SEC("maps") sock_cork_bytes = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 1 -}; - -struct bpf_map_def SEC("maps") sock_pull_bytes = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 2 -}; - -struct bpf_map_def SEC("maps") sock_redir_flags = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 1 -}; - -struct bpf_map_def SEC("maps") sock_skb_opts = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 1 -}; - -SEC("sk_skb1") -int bpf_prog1(struct __sk_buff *skb) -{ - return skb->len; -} - -SEC("sk_skb2") -int bpf_prog2(struct __sk_buff *skb) -{ - __u32 lport = skb->local_port; - __u32 rport = skb->remote_port; - int len, *f, ret, zero = 0; - __u64 flags = 0; - - if (lport == 10000) - ret = 10; - else - ret = 1; - - len = (__u32)skb->data_end - (__u32)skb->data; - f = bpf_map_lookup_elem(&sock_skb_opts, &zero); - if (f && *f) { - ret = 3; - flags = *f; - } - - bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", - len, flags); - return bpf_sk_redirect_map(skb, &sock_map, ret, flags); -} - -SEC("sockops") -int bpf_sockmap(struct bpf_sock_ops *skops) -{ - __u32 lport, rport; - int op, err = 0, index, key, ret; - - - op = (int) skops->op; - - switch (op) { - case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: - lport = skops->local_port; - rport = skops->remote_port; - - if (lport == 10000) { - ret = 1; - err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST); - bpf_printk("passive(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); - } - break; - case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: - lport = skops->local_port; - rport = skops->remote_port; - - if (bpf_ntohl(rport) == 10001) { - ret = 10; - err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST); - bpf_printk("active(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); - } - break; - default: - break; - } - - return 0; -} - -SEC("sk_msg1") -int bpf_prog4(struct sk_msg_md *msg) -{ - int *bytes, zero = 0, one = 1; - int *start, *end; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - bpf_msg_cork_bytes(msg, *bytes); - start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); - end = bpf_map_lookup_elem(&sock_pull_bytes, &one); - if (start && end) - bpf_msg_pull_data(msg, *start, *end, 0); - return SK_PASS; -} - -SEC("sk_msg2") -int bpf_prog5(struct sk_msg_md *msg) -{ - int err1 = -1, err2 = -1, zero = 0, one = 1; - int *bytes, *start, *end, len1, len2; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); - end = bpf_map_lookup_elem(&sock_pull_bytes, &one); - if (start && end) { - int err; - - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", - len1, err1, err2); - return SK_PASS; -} - -SEC("sk_msg3") -int bpf_prog6(struct sk_msg_md *msg) -{ - int *bytes, zero = 0, one = 1, key = 0; - int *start, *end, *f; - __u64 flags = 0; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - bpf_msg_cork_bytes(msg, *bytes); - start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); - end = bpf_map_lookup_elem(&sock_pull_bytes, &one); - if (start && end) - bpf_msg_pull_data(msg, *start, *end, 0); - f = bpf_map_lookup_elem(&sock_redir_flags, &zero); - if (f && *f) { - key = 2; - flags = *f; - } - return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); -} - -SEC("sk_msg4") -int bpf_prog7(struct sk_msg_md *msg) -{ - int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0; - int *f, *bytes, *start, *end, len1, len2; - __u64 flags = 0; - - int err; - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); - end = bpf_map_lookup_elem(&sock_pull_bytes, &one); - if (start && end) { - - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - f = bpf_map_lookup_elem(&sock_redir_flags, &zero); - if (f && *f) { - key = 2; - flags = *f; - } - bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", - len1, flags, err1 ? err1 : err2); - err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); - bpf_printk("sk_msg3: err %i\n", err); - return err; -} - -SEC("sk_msg5") -int bpf_prog8(struct sk_msg_md *msg) -{ - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - int ret = 0, *bytes, zero = 0; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) { - ret = bpf_msg_apply_bytes(msg, *bytes); - if (ret) - return SK_DROP; - } else { - return SK_DROP; - } - return SK_PASS; -} -SEC("sk_msg6") -int bpf_prog9(struct sk_msg_md *msg) -{ - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - int ret = 0, *bytes, zero = 0; - - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) { - if (((__u64)data_end - (__u64)data) >= *bytes) - return SK_PASS; - ret = bpf_msg_cork_bytes(msg, *bytes); - if (ret) - return SK_DROP; - } - return SK_PASS; -} - -SEC("sk_msg7") -int bpf_prog10(struct sk_msg_md *msg) -{ - int *bytes, zero = 0, one = 1; - int *start, *end; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - bpf_msg_cork_bytes(msg, *bytes); - start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); - end = bpf_map_lookup_elem(&sock_pull_bytes, &one); - if (start && end) - bpf_msg_pull_data(msg, *start, *end, 0); - - return SK_DROP; -} - -int _version SEC("version") = 1; -char _license[] SEC("license") = "GPL"; +// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io +#define SOCKMAP +#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKMAP +#include "./test_sockmap_kern.h" diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h new file mode 100644 index 000000000000..8e8e41780bb9 --- /dev/null +++ b/tools/testing/selftests/bpf/test_sockmap_kern.h @@ -0,0 +1,363 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */ +#include <stddef.h> +#include <string.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/pkt_cls.h> +#include <sys/socket.h> +#include "bpf_helpers.h" +#include "bpf_endian.h" + +/* Sockmap sample program connects a client and a backend together + * using cgroups. + * + * client:X <---> frontend:80 client:X <---> backend:80 + * + * For simplicity we hard code values here and bind 1:1. The hard + * coded values are part of the setup in sockmap.sh script that + * is associated with this BPF program. + * + * The bpf_printk is verbose and prints information as connections + * are established and verdicts are decided. + */ + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +struct bpf_map_def SEC("maps") sock_map = { + .type = TEST_MAP_TYPE, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 20, +}; + +struct bpf_map_def SEC("maps") sock_map_txmsg = { + .type = TEST_MAP_TYPE, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 20, +}; + +struct bpf_map_def SEC("maps") sock_map_redir = { + .type = TEST_MAP_TYPE, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 20, +}; + +struct bpf_map_def SEC("maps") sock_apply_bytes = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + +struct bpf_map_def SEC("maps") sock_cork_bytes = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + +struct bpf_map_def SEC("maps") sock_pull_bytes = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 2 +}; + +struct bpf_map_def SEC("maps") sock_redir_flags = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + +struct bpf_map_def SEC("maps") sock_skb_opts = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + +SEC("sk_skb1") +int bpf_prog1(struct __sk_buff *skb) +{ + return skb->len; +} + +SEC("sk_skb2") +int bpf_prog2(struct __sk_buff *skb) +{ + __u32 lport = skb->local_port; + __u32 rport = skb->remote_port; + int len, *f, ret, zero = 0; + __u64 flags = 0; + + if (lport == 10000) + ret = 10; + else + ret = 1; + + len = (__u32)skb->data_end - (__u32)skb->data; + f = bpf_map_lookup_elem(&sock_skb_opts, &zero); + if (f && *f) { + ret = 3; + flags = *f; + } + + bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", + len, flags); +#ifdef SOCKMAP + return bpf_sk_redirect_map(skb, &sock_map, ret, flags); +#else + return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags); +#endif + +} + +SEC("sockops") +int bpf_sockmap(struct bpf_sock_ops *skops) +{ + __u32 lport, rport; + int op, err = 0, index, key, ret; + + + op = (int) skops->op; + + switch (op) { + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + lport = skops->local_port; + rport = skops->remote_port; + + if (lport == 10000) { + ret = 1; +#ifdef SOCKMAP + err = bpf_sock_map_update(skops, &sock_map, &ret, + BPF_NOEXIST); +#else + err = bpf_sock_hash_update(skops, &sock_map, &ret, + BPF_NOEXIST); +#endif + bpf_printk("passive(%i -> %i) map ctx update err: %d\n", + lport, bpf_ntohl(rport), err); + } + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + lport = skops->local_port; + rport = skops->remote_port; + + if (bpf_ntohl(rport) == 10001) { + ret = 10; +#ifdef SOCKMAP + err = bpf_sock_map_update(skops, &sock_map, &ret, + BPF_NOEXIST); +#else + err = bpf_sock_hash_update(skops, &sock_map, &ret, + BPF_NOEXIST); +#endif + bpf_printk("active(%i -> %i) map ctx update err: %d\n", + lport, bpf_ntohl(rport), err); + } + break; + default: + break; + } + + return 0; +} + +SEC("sk_msg1") +int bpf_prog4(struct sk_msg_md *msg) +{ + int *bytes, zero = 0, one = 1; + int *start, *end; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + return SK_PASS; +} + +SEC("sk_msg2") +int bpf_prog5(struct sk_msg_md *msg) +{ + int err1 = -1, err2 = -1, zero = 0, one = 1; + int *bytes, *start, *end, len1, len2; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + err1 = bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + err2 = bpf_msg_cork_bytes(msg, *bytes); + len1 = (__u64)msg->data_end - (__u64)msg->data; + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) { + int err; + + bpf_printk("sk_msg2: pull(%i:%i)\n", + start ? *start : 0, end ? *end : 0); + err = bpf_msg_pull_data(msg, *start, *end, 0); + if (err) + bpf_printk("sk_msg2: pull_data err %i\n", + err); + len2 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length update %i->%i\n", + len1, len2); + } + bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", + len1, err1, err2); + return SK_PASS; +} + +SEC("sk_msg3") +int bpf_prog6(struct sk_msg_md *msg) +{ + int *bytes, zero = 0, one = 1, key = 0; + int *start, *end, *f; + __u64 flags = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + f = bpf_map_lookup_elem(&sock_redir_flags, &zero); + if (f && *f) { + key = 2; + flags = *f; + } +#ifdef SOCKMAP + return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); +#else + return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); +#endif +} + +SEC("sk_msg4") +int bpf_prog7(struct sk_msg_md *msg) +{ + int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0; + int *f, *bytes, *start, *end, len1, len2; + __u64 flags = 0; + + int err; + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + err1 = bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + err2 = bpf_msg_cork_bytes(msg, *bytes); + len1 = (__u64)msg->data_end - (__u64)msg->data; + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) { + + bpf_printk("sk_msg2: pull(%i:%i)\n", + start ? *start : 0, end ? *end : 0); + err = bpf_msg_pull_data(msg, *start, *end, 0); + if (err) + bpf_printk("sk_msg2: pull_data err %i\n", + err); + len2 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length update %i->%i\n", + len1, len2); + } + f = bpf_map_lookup_elem(&sock_redir_flags, &zero); + if (f && *f) { + key = 2; + flags = *f; + } + bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", + len1, flags, err1 ? err1 : err2); +#ifdef SOCKMAP + err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); +#else + err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); +#endif + bpf_printk("sk_msg3: err %i\n", err); + return err; +} + +SEC("sk_msg5") +int bpf_prog8(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0, *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) { + ret = bpf_msg_apply_bytes(msg, *bytes); + if (ret) + return SK_DROP; + } else { + return SK_DROP; + } + return SK_PASS; +} +SEC("sk_msg6") +int bpf_prog9(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0, *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) { + if (((__u64)data_end - (__u64)data) >= *bytes) + return SK_PASS; + ret = bpf_msg_cork_bytes(msg, *bytes); + if (ret) + return SK_DROP; + } + return SK_PASS; +} + +SEC("sk_msg7") +int bpf_prog10(struct sk_msg_md *msg) +{ + int *bytes, zero = 0, one = 1; + int *start, *end; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + + return SK_DROP; +} + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; |