summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Borkmann <daniel@iogearbox.net>2020-01-15 23:26:14 +0100
committerDaniel Borkmann <daniel@iogearbox.net>2020-01-15 23:26:23 +0100
commit85ddd9c3173102930c16b0cfe8dbb771af434532 (patch)
treebf73100781a6b1416cb714f3d72321f247bf44ea
parent0af2ffc93a4b50948f9dad2786b7f1bd253bf0b9 (diff)
parent7361d44896ff20d48bdd502d1a0cd66308055d45 (diff)
Merge branch 'bpf-sockmap-tls-fixes'
John Fastabend says: ==================== To date our usage of sockmap/tls has been fairly simple, the BPF programs did only well-defined pop, push, pull and apply/cork operations. Now that we started to push more complex programs into sockmap we uncovered a series of issues addressed here. Further OpenSSL3.0 version should be released soon with kTLS support so its important to get any remaining issues on BPF and kTLS support resolved. Additionally, I have a patch under development to allow sockmap to be enabled/disabled at runtime for Cilium endpoints. This allows us to stress the map insert/delete with kTLS more than previously where Cilium only added the socket to the map when it entered ESTABLISHED state and never touched it from the control path side again relying on the sockets own close() hook to remove it. To test I have a set of test cases in test_sockmap.c that expose these issues. Once we get fixes here merged and in bpf-next I'll submit the tests to bpf-next tree to ensure we don't regress again. Also I've run these patches in the Cilium CI with OpenSSL (master branch) this will run tools such as netperf, ab, wrk2, curl, etc. to get a broad set of testing. I'm aware of two more issues that we are working to resolve in another couple (probably two) patches. First we see an auth tag corruption in kTLS when sending small 1byte chunks under stress. I've not pinned this down yet. But, guessing because its under 1B stress tests it must be some error path being triggered. And second we need to ensure BPF RX programs are not skipped when kTLS ULP is loaded. This breaks some of the sockmap selftests when running with kTLS. I'll send a follow up for this. v2: I dropped a patch that added !0 size check in tls_push_record this originated from a panic I caught awhile ago with a trace in the crypto stack. But I can not reproduce it anymore so will dig into that and send another patch later if needed. Anyways after a bit of thought it would be nicer if tls/crypto/bpf didn't require special case handling for the !0 size. ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
-rw-r--r--include/linux/skmsg.h13
-rw-r--r--include/net/tcp.h6
-rw-r--r--net/core/filter.c11
-rw-r--r--net/core/skmsg.c2
-rw-r--r--net/core/sock_map.c7
-rw-r--r--net/ipv4/tcp_bpf.c5
-rw-r--r--net/ipv4/tcp_ulp.c6
-rw-r--r--net/tls/tls_main.c10
-rw-r--r--net/tls/tls_sw.c31
9 files changed, 66 insertions, 25 deletions
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index ef7031f8a304..14d61bba0b79 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -358,17 +358,22 @@ static inline void sk_psock_update_proto(struct sock *sk,
static inline void sk_psock_restore_proto(struct sock *sk,
struct sk_psock *psock)
{
- sk->sk_write_space = psock->saved_write_space;
+ sk->sk_prot->unhash = psock->saved_unhash;
if (psock->sk_proto) {
struct inet_connection_sock *icsk = inet_csk(sk);
bool has_ulp = !!icsk->icsk_ulp_data;
- if (has_ulp)
- tcp_update_ulp(sk, psock->sk_proto);
- else
+ if (has_ulp) {
+ tcp_update_ulp(sk, psock->sk_proto,
+ psock->saved_write_space);
+ } else {
sk->sk_prot = psock->sk_proto;
+ sk->sk_write_space = psock->saved_write_space;
+ }
psock->sk_proto = NULL;
+ } else {
+ sk->sk_write_space = psock->saved_write_space;
}
}
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e460ea7f767b..e6f48384dc71 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2147,7 +2147,8 @@ struct tcp_ulp_ops {
/* initialize ulp */
int (*init)(struct sock *sk);
/* update ulp */
- void (*update)(struct sock *sk, struct proto *p);
+ void (*update)(struct sock *sk, struct proto *p,
+ void (*write_space)(struct sock *sk));
/* cleanup ulp */
void (*release)(struct sock *sk);
/* diagnostic */
@@ -2162,7 +2163,8 @@ void tcp_unregister_ulp(struct tcp_ulp_ops *type);
int tcp_set_ulp(struct sock *sk, const char *name);
void tcp_get_available_ulp(char *buf, size_t len);
void tcp_cleanup_ulp(struct sock *sk);
-void tcp_update_ulp(struct sock *sk, struct proto *p);
+void tcp_update_ulp(struct sock *sk, struct proto *p,
+ void (*write_space)(struct sock *sk));
#define MODULE_ALIAS_TCP_ULP(name) \
__MODULE_INFO(alias, alias_userspace, name); \
diff --git a/net/core/filter.c b/net/core/filter.c
index d22d108fc6e3..538f6a735a19 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2231,10 +2231,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
+ offset += len;
len = sk_msg_elem(msg, i)->length;
if (start < offset + len)
break;
- offset += len;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
@@ -2346,7 +2346,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
- u32 new, i = 0, l, space, copy = 0, offset = 0;
+ u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
u8 *raw, *to, *from;
struct page *page;
@@ -2356,11 +2356,11 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
+ offset += l;
l = sk_msg_elem(msg, i)->length;
if (start < offset + l)
break;
- offset += l;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
@@ -2415,6 +2415,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_next(i);
sg_unmark_end(psge);
+ sg_unmark_end(&rsge);
sk_msg_iter_next(msg, end);
}
@@ -2506,7 +2507,7 @@ static void sk_msg_shift_right(struct sk_msg *msg, int i)
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
- u32 i = 0, l, space, offset = 0;
+ u32 i = 0, l = 0, space, offset = 0;
u64 last = start + len;
int pop;
@@ -2516,11 +2517,11 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
+ offset += l;
l = sk_msg_elem(msg, i)->length;
if (start < offset + l)
break;
- offset += l;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index ded2d5227678..3866d7e20c07 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -594,6 +594,8 @@ EXPORT_SYMBOL_GPL(sk_psock_destroy);
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
+ sock_owned_by_me(sk);
+
sk_psock_cork_free(psock);
sk_psock_zap_ingress(psock);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index eb114ee419b6..8998e356f423 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -241,8 +241,11 @@ static void sock_map_free(struct bpf_map *map)
struct sock *sk;
sk = xchg(psk, NULL);
- if (sk)
+ if (sk) {
+ lock_sock(sk);
sock_map_unref(sk, psk);
+ release_sock(sk);
+ }
}
raw_spin_unlock_bh(&stab->lock);
rcu_read_unlock();
@@ -862,7 +865,9 @@ static void sock_hash_free(struct bpf_map *map)
raw_spin_lock_bh(&bucket->lock);
hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
hlist_del_rcu(&elem->node);
+ lock_sock(elem->sk);
sock_map_unref(elem->sk, elem);
+ release_sock(elem->sk);
}
raw_spin_unlock_bh(&bucket->lock);
}
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index e6b08b5a0895..8a01428f80c1 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -315,10 +315,7 @@ more_data:
*/
delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
- if (msg->sg.size < delta)
- delta -= msg->sg.size;
- else
- delta = 0;
+ delta -= msg->sg.size;
}
if (msg->cork_bytes &&
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 12ab5db2b71c..38d3ad141161 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -99,17 +99,19 @@ void tcp_get_available_ulp(char *buf, size_t maxlen)
rcu_read_unlock();
}
-void tcp_update_ulp(struct sock *sk, struct proto *proto)
+void tcp_update_ulp(struct sock *sk, struct proto *proto,
+ void (*write_space)(struct sock *sk))
{
struct inet_connection_sock *icsk = inet_csk(sk);
if (!icsk->icsk_ulp_ops) {
+ sk->sk_write_space = write_space;
sk->sk_prot = proto;
return;
}
if (icsk->icsk_ulp_ops->update)
- icsk->icsk_ulp_ops->update(sk, proto);
+ icsk->icsk_ulp_ops->update(sk, proto, write_space);
}
void tcp_cleanup_ulp(struct sock *sk)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index dac24c7aa7d4..94774c0e5ff3 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -732,15 +732,19 @@ out:
return rc;
}
-static void tls_update(struct sock *sk, struct proto *p)
+static void tls_update(struct sock *sk, struct proto *p,
+ void (*write_space)(struct sock *sk))
{
struct tls_context *ctx;
ctx = tls_get_ctx(sk);
- if (likely(ctx))
+ if (likely(ctx)) {
+ ctx->sk_write_space = write_space;
ctx->sk_proto = p;
- else
+ } else {
sk->sk_prot = p;
+ sk->sk_write_space = write_space;
+ }
}
static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index c6803a82b769..159d49dab403 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -682,12 +682,32 @@ static int tls_push_record(struct sock *sk, int flags,
split_point = msg_pl->apply_bytes;
split = split_point && split_point < msg_pl->sg.size;
+ if (unlikely((!split &&
+ msg_pl->sg.size +
+ prot->overhead_size > msg_en->sg.size) ||
+ (split &&
+ split_point +
+ prot->overhead_size > msg_en->sg.size))) {
+ split = true;
+ split_point = msg_en->sg.size;
+ }
if (split) {
rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en,
split_point, prot->overhead_size,
&orig_end);
if (rc < 0)
return rc;
+ /* This can happen if above tls_split_open_record allocates
+ * a single large encryption buffer instead of two smaller
+ * ones. In this case adjust pointers and continue without
+ * split.
+ */
+ if (!msg_pl->sg.size) {
+ tls_merge_open_record(sk, rec, tmp, orig_end);
+ msg_pl = &rec->msg_plaintext;
+ msg_en = &rec->msg_encrypted;
+ split = false;
+ }
sk_msg_trim(sk, msg_en, msg_pl->sg.size +
prot->overhead_size);
}
@@ -709,6 +729,12 @@ static int tls_push_record(struct sock *sk, int flags,
sg_mark_end(sk_msg_elem(msg_pl, i));
}
+ if (msg_pl->sg.end < msg_pl->sg.start) {
+ sg_chain(&msg_pl->sg.data[msg_pl->sg.start],
+ MAX_SKB_FRAGS - msg_pl->sg.start + 1,
+ msg_pl->sg.data);
+ }
+
i = msg_pl->sg.start;
sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]);
@@ -783,10 +809,7 @@ more_data:
if (psock->eval == __SK_NONE) {
delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
- if (delta < msg->sg.size)
- delta -= msg->sg.size;
- else
- delta = 0;
+ delta -= msg->sg.size;
}
if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
!enospc && !full_record) {