From 2c3682f0be97a5f57c6c8b40fa154dfc77efb461 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Sun, 18 Mar 2018 12:56:49 -0700
Subject: sock: make static tls function alloc_sg generic sock helper

The TLS ULP module builds scatterlists from a sock using
page_frag_refill(). This is going to be useful for other ULPs, so move
it into the sock file for more general use. In the process, remove the
useless goto at the end of the while loop.

Signed-off-by: John Fastabend
Acked-by: David S. Miller
Signed-off-by: Daniel Borkmann
---
 net/core/sock.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 27f218bba43f..f68dff0d7bc4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2239,6 +2239,62 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 }
 EXPORT_SYMBOL(sk_page_frag_refill);
 
+int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
+		int *sg_num_elem, unsigned int *sg_size,
+		int first_coalesce)
+{
+	struct page_frag *pfrag;
+	unsigned int size = *sg_size;
+	int num_elem = *sg_num_elem, use = 0, rc = 0;
+	struct scatterlist *sge;
+	unsigned int orig_offset;
+
+	len -= size;
+	pfrag = sk_page_frag(sk);
+
+	while (len > 0) {
+		if (!sk_page_frag_refill(sk, pfrag)) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		use = min_t(int, len, pfrag->size - pfrag->offset);
+
+		if (!sk_wmem_schedule(sk, use)) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		sk_mem_charge(sk, use);
+		size += use;
+		orig_offset = pfrag->offset;
+		pfrag->offset += use;
+
+		sge = sg + num_elem - 1;
+		if (num_elem > first_coalesce && sg_page(sg) == pfrag->page &&
+		    sg->offset + sg->length == orig_offset) {
+			sg->length += use;
+		} else {
+			sge++;
+			sg_unmark_end(sge);
+			sg_set_page(sge, pfrag->page, use, orig_offset);
+			get_page(pfrag->page);
+			++num_elem;
+			if (num_elem == MAX_SKB_FRAGS) {
+				rc = -ENOSPC;
+				break;
+			}
+		}
+
+		len -= use;
+	}
+out:
+	*sg_size = size;
+	*sg_num_elem = num_elem;
+	return rc;
+}
+EXPORT_SYMBOL(sk_alloc_sg);
+
 static void __lock_sock(struct sock *sk)
 	__releases(&sk->sk_lock.slock)
 	__acquires(&sk->sk_lock.slock)
--
cgit v1.2.3
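For illustration, a hedged sketch of how another ULP might drive the
now-generic helper; the wrapper name and the 4096-byte length are
invented for this example, and it assumes the caller holds the socket
lock, as the TLS paths do:

  /* Append up to 4KB of page-frag backed memory to a caller-owned
   * scatterlist of MAX_SKB_FRAGS entries; *num_elem and *size carry
   * the list state across calls (both start at 0).
   */
  static int example_ulp_fill_sg(struct sock *sk, struct scatterlist *sg,
				 int *num_elem, unsigned int *size)
  {
	/* first_coalesce = 0 lets contiguous page-frag space extend an
	 * existing tail entry instead of consuming a new slot.
	 */
	int rc = sk_alloc_sg(sk, 4096, sg, num_elem, size, 0);

	if (rc == -ENOSPC)	/* all MAX_SKB_FRAGS slots in use */
		pr_debug("sg list full: %d elems, %u bytes\n",
			 *num_elem, *size);
	return rc;
  }
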
From 8c05dbf04b2882c3c0bc43fe7668c720210877f3 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Sun, 18 Mar 2018 12:57:05 -0700
Subject: net: generalize sk_alloc_sg to work with scatterlist rings

The current implementation of sk_alloc_sg expects the scatterlist to
always start at entry 0 and complete at entry MAX_SKB_FRAGS. Future
patches will want to support starting at an arbitrary offset into the
scatterlist, so add an additional sg_start parameter and default to the
current values in the TLS code paths.

Signed-off-by: John Fastabend
Acked-by: David S. Miller
Signed-off-by: Daniel Borkmann
---
 net/core/sock.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index f68dff0d7bc4..4f92c2910200 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2240,19 +2240,20 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 EXPORT_SYMBOL(sk_page_frag_refill);
 
 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
-		int *sg_num_elem, unsigned int *sg_size,
+		int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
 		int first_coalesce)
 {
+	int sg_curr = *sg_curr_index, use = 0, rc = 0;
+	unsigned int size = *sg_curr_size;
 	struct page_frag *pfrag;
-	unsigned int size = *sg_size;
-	int num_elem = *sg_num_elem, use = 0, rc = 0;
 	struct scatterlist *sge;
-	unsigned int orig_offset;
 
 	len -= size;
 	pfrag = sk_page_frag(sk);
 
 	while (len > 0) {
+		unsigned int orig_offset;
+
 		if (!sk_page_frag_refill(sk, pfrag)) {
 			rc = -ENOMEM;
 			goto out;
@@ -2270,17 +2271,21 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
 		orig_offset = pfrag->offset;
 		pfrag->offset += use;
 
-		sge = sg + num_elem - 1;
-		if (num_elem > first_coalesce && sg_page(sg) == pfrag->page &&
+		sge = sg + sg_curr - 1;
+		if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
 		    sg->offset + sg->length == orig_offset) {
 			sg->length += use;
 		} else {
-			sge++;
+			sge = sg + sg_curr;
 			sg_unmark_end(sge);
 			sg_set_page(sge, pfrag->page, use, orig_offset);
 			get_page(pfrag->page);
-			++num_elem;
-			if (num_elem == MAX_SKB_FRAGS) {
+			sg_curr++;
+
+			if (sg_curr == MAX_SKB_FRAGS)
+				sg_curr = 0;
+
+			if (sg_curr == sg_start) {
 				rc = -ENOSPC;
 				break;
 			}
@@ -2289,8 +2294,8 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
 		len -= use;
 	}
 out:
-	*sg_size = size;
-	*sg_num_elem = num_elem;
+	*sg_curr_size = size;
+	*sg_curr_index = sg_curr;
 	return rc;
 }
 EXPORT_SYMBOL(sk_alloc_sg);
--
cgit v1.2.3
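To make the ring semantics concrete, a hedged sketch of a caller that
treats the array as a ring; the wrapper name and the 8192-byte length
are invented, and TLS keeps the old behavior by passing sg_start == 0
on a list that never wraps:

  /* Entries [sg_start, *sg_curr) hold pending data and indices wrap at
   * MAX_SKB_FRAGS, so -ENOSPC now means "producer caught the consumer
   * index" rather than "array exhausted".
   */
  static int example_ring_fill(struct sock *sk, struct scatterlist *ring,
			       int sg_start, int *sg_curr,
			       unsigned int *bytes)
  {
	return sk_alloc_sg(sk, 8192, ring, sg_start, sg_curr, bytes, 0);
  }
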
From 4f738adba30a7cfc006f605707e7aee847ffefa0 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Sun, 18 Mar 2018 12:57:10 -0700
Subject: bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data

This implements a BPF ULP layer to allow policy enforcement and
monitoring at the socket layer. In order to support this, a new program
type BPF_PROG_TYPE_SK_MSG is used to run the policy at the
sendmsg/sendpage hook. To attach the policy to sockets, a sockmap is
used with a new program attach type BPF_SK_MSG_VERDICT.

Similar to previous sockmap usages, when a sock is added to a sockmap
via a map update, if the map has a BPF_SK_MSG_VERDICT program type
attached, then the BPF ULP layer is created on the socket and the
attached BPF_PROG_TYPE_SK_MSG program is run for every msg in the
sendmsg case and page/offset in the sendpage case.

BPF_PROG_TYPE_SK_MSG Semantics/API:

BPF_PROG_TYPE_SK_MSG supports only two return codes, SK_PASS and
SK_DROP. Returning SK_DROP frees the copied data in the sendmsg case
and leaves the data untouched in the sendpage case. Both cases return
-EACCES to the user. Returning SK_PASS will allow the msg to be sent.

In the sendmsg case, data is copied into kernel space buffers before
running the BPF program. The kernel space buffers are stored in a
scatterlist object where each element is a kernel memory buffer. Some
effort is made to coalesce data from the sendmsg call here. For
example, a sendmsg call with many one byte iov entries will likely be
pushed into a single entry. The BPF program is run with data pointers
(start/end) pointing to the first sg element.

In the sendpage case, data is not copied. We opt not to copy the data
by default here, because the BPF infrastructure does not know what
bytes will be needed nor when they will be needed. So copying all bytes
may be wasteful. Because of this, the initial start/end data pointers
are (0,0), meaning no data can be read or written. This avoids reading
data that may be modified by the user. A new helper is added later in
this series if reading and writing the data is needed. The helper call
will do a copy by default so that the page is exclusively owned by the
BPF call.

The verdict from the BPF_PROG_TYPE_SK_MSG program applies to the entire
msg in the sendmsg() case and the entire page/offset in the sendpage
case. This avoids ambiguity on how to handle mixed return codes in the
sendmsg case. Again, a helper is added later in the series if a verdict
needs to apply to multiple system calls and/or only a subpart of the
currently being processed message.

The helper msg_redirect_map() can be used to select the socket to send
the data on. This is used similar to existing redirect use cases. This
allows policy to redirect msgs.

Pseudo code simple example:

The basic logic to attach a program to a socket is as follows,

  // load the programs
  bpf_prog_load(SOCKMAP_TCP_MSG_PROG, BPF_PROG_TYPE_SK_MSG,
		&obj, &msg_prog);

  // lookup the sockmap
  bpf_map_msg = bpf_object__find_map_by_name(obj, "my_sock_map");

  // get fd for sockmap
  map_fd_msg = bpf_map__fd(bpf_map_msg);

  // attach program to sockmap
  bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0);

Adding sockets to the map is done in the normal way,

  // Add a socket 'fd' to sockmap at location 'i'
  bpf_map_update_elem(map_fd_msg, &i, fd, BPF_ANY);

After the above, any socket attached to "my_sock_map", in this case
'fd', will run the BPF msg verdict program (msg_prog) on every sendmsg
and sendpage system call. For a complete example see BPF selftests or
sockmap samples.

Implementation notes:

It seemed simplest, to me at least, to use a refcnt to ensure the psock
is not lost across the sendmsg copy into the sg, the BPF program
running on the data in sg_data, and the final pass to the TCP stack.
Some performance testing may show a better method to do this and avoid
the refcnt cost, but for now use the simpler method.

Another item that will come after basic support is in place is
supporting the MSG_MORE flag. At the moment we call sendpages even if
the MSG_MORE flag is set. An enhancement would be to collect the pages
into a larger scatterlist and pass it down the stack. Notice that
bpf_tcp_sendmsg() could support this with some additional state saved
across sendmsg calls. I built the code to support this without having
to do refactoring work.

Other features TBD include ZEROCOPY and the TCP_RECV_QUEUE/TCP_NO_QUEUE
support. This will follow the initial series shortly.

Future work could improve size limits on the scatterlist rings used
here. Currently, we use MAX_SKB_FRAGS simply because this was being
used already in the TLS case. Future work could extend the kernel sk
APIs to tune this depending on workload. This is a trade-off between
memory usage and throughput performance.

Signed-off-by: John Fastabend
Acked-by: David S. Miller
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
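For reference alongside the pseudo code above, a hedged sketch of a
minimal verdict program in the style of the sockmap samples; the
section name, the SK_PASS/SK_DROP values, and the bpf_helpers.h macros
are assumptions drawn from samples/sockmap, not part of this patch:

  #include <uapi/linux/bpf.h>
  #include "bpf_helpers.h"	/* SEC() and friends, as in the samples */

  #define SK_DROP 0
  #define SK_PASS 1		/* assumed to match enum sk_action */

  SEC("sk_msg")
  int msg_verdict_prog(struct sk_msg_md *msg)
  {
	void *data = (void *)(long)msg->data;
	void *data_end = (void *)(long)msg->data_end;

	/* In the sendpage case data == data_end (the (0,0) window
	 * described above), so this check fails and the msg passes.
	 */
	if (data + 1 <= data_end && *(unsigned char *)data == 0xff)
		return SK_DROP;
	return SK_PASS;
  }

  char _license[] SEC("license") = "GPL";
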
 net/core/filter.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 33edfa8372fd..2b6c47597eab 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1890,6 +1890,44 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
+	   struct bpf_map *, map, u32, key, u64, flags)
+{
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags))
+		return SK_DROP;
+
+	msg->key = key;
+	msg->flags = flags;
+	msg->map = map;
+
+	return SK_PASS;
+}
+
+struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+{
+	struct sock *sk = NULL;
+
+	if (msg->map) {
+		sk = __sock_map_lookup_elem(msg->map, msg->key);
+
+		msg->key = 0;
+		msg->map = NULL;
+	}
+
+	return sk;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+	.func           = bpf_msg_redirect_map,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_ANYTHING,
+	.arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -3591,6 +3629,16 @@ static const struct bpf_func_proto *
 	}
 }
 
+static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_msg_redirect_map:
+		return &bpf_msg_redirect_map_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
 static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -3980,6 +4028,32 @@ static bool sk_skb_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, info);
 }
 
+static bool sk_msg_is_valid_access(int off, int size,
+				   enum bpf_access_type type,
+				   struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case offsetof(struct sk_msg_md, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct sk_msg_md, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	if (off < 0 || off >= sizeof(struct sk_msg_md))
+		return false;
+	if (off % size != 0)
+		return false;
+	if (size != sizeof(__u64))
+		return false;
+
+	return true;
+}
+
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
@@ -4778,6 +4852,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
+static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
+				     const struct bpf_insn *si,
+				     struct bpf_insn *insn_buf,
+				     struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct sk_msg_md, data):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, data));
+		break;
+	case offsetof(struct sk_msg_md, data_end):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, data_end));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 const struct bpf_verifier_ops sk_filter_verifier_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
@@ -4868,6 +4965,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = {
 const struct bpf_prog_ops sk_skb_prog_ops = {
 };
 
+const struct bpf_verifier_ops sk_msg_verifier_ops = {
+	.get_func_proto		= sk_msg_func_proto,
+	.is_valid_access	= sk_msg_is_valid_access,
+	.convert_ctx_access	= sk_msg_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_msg_prog_ops = {
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
--
cgit v1.2.3
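As an aside, the bpf_msg_redirect_map() helper introduced above can be
driven from a verdict program; a hedged sketch follows, where the map
name and the fixed key are invented and the program-side helper
wrapper is assumed to come from the samples' bpf_helpers.h:

  struct bpf_map_def SEC("maps") my_sock_map = {
	.type		= BPF_MAP_TYPE_SOCKMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 2,
  };

  SEC("sk_msg")
  int msg_redirect_prog(struct sk_msg_md *msg)
  {
	/* Steer every msg to the socket stored at index 1. The helper
	 * itself returns SK_DROP if flags are non-zero.
	 */
	return bpf_msg_redirect_map(msg, &my_sock_map, 1, 0);
  }
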
From 2a100317c9ebc204a166f16294884fbf9da074ce Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Sun, 18 Mar 2018 12:57:15 -0700
Subject: bpf: sockmap, add bpf_msg_apply_bytes() helper

A single sendmsg or sendfile system call can contain multiple logical
messages that a BPF program may want to read and apply a verdict to.
But, without an apply_bytes helper, any verdict on the data applies to
all bytes in the sendmsg/sendfile. Alternatively, a BPF program may
only care to read the first N bytes of a msg. If the payload is large,
say MB or even GB, setting up and calling the BPF program repeatedly
for all bytes, even though the verdict is already known, creates
unnecessary overhead.

To allow BPF programs to control how many bytes a given verdict
applies to, we implement a bpf_msg_apply_bytes() helper. When called
from within a BPF program, this sets a counter, internal to the BPF
infrastructure, that applies the last verdict to the next N bytes. If
N is smaller than the current data being processed from a
sendmsg/sendfile call, the first N bytes will be sent and the BPF
program will be re-run with start_data pointing to the N+1 byte. If N
is larger than the current data being processed, the BPF verdict will
be applied to multiple sendmsg/sendfile calls until N bytes are
consumed.

Note: if a socket closes with the apply_bytes counter non-zero, this
is not a problem because data is not being buffered for N bytes and is
sent as it's received.

Signed-off-by: John Fastabend
Acked-by: David S. Miller
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 net/core/filter.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 2b6c47597eab..17d6775f5431 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1928,6 +1928,20 @@ static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+	msg->apply_bytes = bytes;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
+	.func           = bpf_msg_apply_bytes,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -3634,6 +3648,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 	switch (func_id) {
 	case BPF_FUNC_msg_redirect_map:
 		return &bpf_msg_redirect_map_proto;
+	case BPF_FUNC_msg_apply_bytes:
+		return &bpf_msg_apply_bytes_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
--
cgit v1.2.3
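A hedged sketch of the helper in use; the 64KB figure is arbitrary and
the helper wrapper is again assumed from the samples' bpf_helpers.h:

  SEC("sk_msg")
  int msg_apply_prog(struct sk_msg_md *msg)
  {
	/* Extend this verdict over the next 64KB so a large sendfile()
	 * payload does not re-enter the program once the decision is
	 * already known.
	 */
	bpf_msg_apply_bytes(msg, 65536);
	return SK_PASS;
  }
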
From 91843d540a139eb8070bcff8aa10089164436deb Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Sun, 18 Mar 2018 12:57:20 -0700
Subject: bpf: sockmap, add msg_cork_bytes() helper

In the case where we need a specific number of bytes before a verdict
can be assigned, even if the data spans multiple sendmsg or sendfile
calls, the BPF program may use msg_cork_bytes(). The extreme case is a
user that calls sendmsg repeatedly with 1-byte msg segments.
Obviously, this is bad for performance but is still valid. If the BPF
program needs N bytes to validate a header, it can use msg_cork_bytes
to specify N bytes and the BPF program will not be called again until
N bytes have been accumulated. The infrastructure will attempt to
coalesce data if possible, so in many cases (most of my use cases at
least) the data will be in a single scatterlist element with data
pointers pointing to the start/end of the element. However, this is
dependent on available memory, so it is not guaranteed. So BPF
programs must validate data pointer ranges, but this is the case
anyways to convince the verifier the accesses are valid.

Signed-off-by: John Fastabend
Acked-by: David S. Miller
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 net/core/filter.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 17d6775f5431..0c9daf6ee555 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1942,6 +1942,20 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+	msg->cork_bytes = bytes;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
+	.func           = bpf_msg_cork_bytes,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -3650,6 +3664,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 		return &bpf_msg_redirect_map_proto;
 	case BPF_FUNC_msg_apply_bytes:
 		return &bpf_msg_apply_bytes_proto;
+	case BPF_FUNC_msg_cork_bytes:
+		return &bpf_msg_cork_bytes_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
--
cgit v1.2.3
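A hedged sketch of the header-accumulation pattern described above;
the 8-byte header size is invented, and as the commit message notes,
whether the corked bytes end up linear is not guaranteed:

  SEC("sk_msg")
  int msg_cork_prog(struct sk_msg_md *msg)
  {
	void *data = (void *)(long)msg->data;
	void *data_end = (void *)(long)msg->data_end;

	/* Not enough linear bytes yet: ask for 8 before the program is
	 * run again, and let this partial data pass for now.
	 */
	if (data + 8 > data_end) {
		bpf_msg_cork_bytes(msg, 8);
		return SK_PASS;
	}

	/* 8-byte header visible: a real program would parse it here. */
	return SK_PASS;
  }
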
From 015632bb30daaaee64e1bcac07570860e0bf3092 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Sun, 18 Mar 2018 12:57:25 -0700
Subject: bpf: sk_msg program helper bpf_sk_msg_pull_data

Currently, if a bpf sk msg program is run, the program can only parse
data that the (start,end) pointers already consumed. For sendmsg hooks
this is likely the first scatterlist element. For sendpage this will
be the range (0,0) because the data is shared with userspace and by
default we want to avoid allowing userspace to modify data while (or
after) the BPF verdict is being decided.

To support pulling in additional bytes for parsing, use a new helper
bpf_msg_pull_data(start, end, flags) which works similar to cls tc
logic. This helper will attempt to point the data start pointer at
'start' bytes offset into the msg and the data end pointer at 'end'
bytes offset into the message.

After basic sanity checks to ensure 'start' <= 'end' and 'end' <=
msg_length, there are a few cases we need to handle.

First, the sendmsg hook has already copied the data from userspace and
has exclusive access to it. Therefore, copying the data is not
necessary for safety, though it may still be required to linearize it.
After finding the scatterlist element with the 'start' offset byte in
it, there are two cases. One: the range (start,end) is entirely
contained in that sg element and is already linear. All that is needed
is to update the data pointers; no allocate/copy is needed. The other
case is that (start, end) crosses sg element boundaries. In this case
we allocate a block of size 'end - start' and copy the data to
linearize it.

Next, the sendpage hook has not copied any data in its initial state,
so the data pointers are (0,0). In this case we handle it similar to
the above sendmsg case, except the allocation/copy must always happen.
Then when sending the data we have possibly three memory regions that
need to be sent, (0, start - 1), (start, end), and (end + 1,
msg_length). This is required to ensure any writes by the BPF program
are correctly transmitted.

Lastly, this operation will invalidate any previous data checks, so
BPF programs will have to revalidate pointers after making this BPF
call.

Signed-off-by: John Fastabend
Acked-by: David S. Miller
Signed-off-by: Daniel Borkmann
---
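A hedged sketch of the revalidation pattern this implies for programs;
the 16-byte window and the leading-byte check are invented:

  SEC("sk_msg")
  int msg_pull_prog(struct sk_msg_md *msg)
  {
	void *data, *data_end;

	/* Linearize bytes [0, 16) into BPF-private memory; in the
	 * sendpage case this is the only way to see any data at all,
	 * since the window starts at (0,0).
	 */
	if (bpf_msg_pull_data(msg, 0, 16, 0))
		return SK_DROP;

	/* The pull invalidates prior checks: reload and recheck. */
	data = (void *)(long)msg->data;
	data_end = (void *)(long)msg->data_end;
	if (data + 16 > data_end)
		return SK_DROP;

	return ((unsigned char *)data)[0] == 0xff ? SK_DROP : SK_PASS;
  }
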
 net/core/filter.c | 135 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 134 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 0c9daf6ee555..c86f03fd9ea5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1956,6 +1956,136 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_pull_data,
+	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+{
+	unsigned int len = 0, offset = 0, copy = 0;
+	struct scatterlist *sg = msg->sg_data;
+	int first_sg, last_sg, i, shift;
+	unsigned char *p, *to, *from;
+	int bytes = end - start;
+	struct page *page;
+
+	if (unlikely(flags || end <= start))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg_start;
+	do {
+		len = sg[i].length;
+		offset += len;
+		if (start < offset + len)
+			break;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != msg->sg_end);
+
+	if (unlikely(start >= offset + len))
+		return -EINVAL;
+
+	if (!msg->sg_copy[i] && bytes <= len)
+		goto out;
+
+	first_sg = i;
+
+	/* At this point we need to linearize multiple scatterlist
+	 * elements or a single shared page. Either way we need to
+	 * copy into a linear buffer exclusively owned by BPF. Then
+	 * place the buffer in the scatterlist and fixup the original
+	 * entries by removing the entries now in the linear buffer
+	 * and shifting the remaining entries. For now we do not try
+	 * to copy partial entries to avoid complexity of running out
+	 * of sg_entry slots. The downside is reading a single byte
+	 * will copy the entire sg entry.
+	 */
+	do {
+		copy += sg[i].length;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+		if (bytes < copy)
+			break;
+	} while (i != msg->sg_end);
+	last_sg = i;
+
+	if (unlikely(copy < end - start))
+		return -EINVAL;
+
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
+	if (unlikely(!page))
+		return -ENOMEM;
+	p = page_address(page);
+	offset = 0;
+
+	i = first_sg;
+	do {
+		from = sg_virt(&sg[i]);
+		len = sg[i].length;
+		to = p + offset;
+
+		memcpy(to, from, len);
+		offset += len;
+		sg[i].length = 0;
+		put_page(sg_page(&sg[i]));
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != last_sg);
+
+	sg[first_sg].length = copy;
+	sg_set_page(&sg[first_sg], page, copy, 0);
+
+	/* To repair sg ring we need to shift entries. If we only
+	 * had a single entry though we can just replace it and
+	 * be done. Otherwise walk the ring and shift the entries.
+	 */
+	shift = last_sg - first_sg - 1;
+	if (!shift)
+		goto out;
+
+	i = first_sg + 1;
+	do {
+		int move_from;
+
+		if (i + shift >= MAX_SKB_FRAGS)
+			move_from = i + shift - MAX_SKB_FRAGS;
+		else
+			move_from = i + shift;
+
+		if (move_from == msg->sg_end)
+			break;
+
+		sg[i] = sg[move_from];
+		sg[move_from].length = 0;
+		sg[move_from].page_link = 0;
+		sg[move_from].offset = 0;
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (1);
+	msg->sg_end -= shift;
+	if (msg->sg_end < 0)
+		msg->sg_end += MAX_SKB_FRAGS;
+out:
+	msg->data = sg_virt(&sg[i]) + start - offset;
+	msg->data_end = msg->data + bytes;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+	.func		= bpf_msg_pull_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -2897,7 +3027,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
 	    func == bpf_xdp_adjust_head ||
-	    func == bpf_xdp_adjust_meta)
+	    func == bpf_xdp_adjust_meta ||
+	    func == bpf_msg_pull_data)
 		return true;
 
 	return false;
@@ -3666,6 +3797,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 		return &bpf_msg_apply_bytes_proto;
 	case BPF_FUNC_msg_cork_bytes:
 		return &bpf_msg_cork_bytes_proto;
+	case BPF_FUNC_msg_pull_data:
+		return &bpf_msg_pull_data_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
--
cgit v1.2.3
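To close the series, a hedged userspace sketch that fleshes out the
attach pseudo code from the tcp_bpf_ulp commit using libbpf calls as
they existed at the time; the header paths, object file name, map
name, and error handling are all invented for this example:

  #include <bpf/bpf.h>
  #include <bpf/libbpf.h>

  /* Attach an SK_MSG verdict program to 'sock_fd' via a sockmap. */
  int attach_msg_verdict(int sock_fd)
  {
	struct bpf_object *obj;
	int prog_fd, map_fd, idx = 0;

	if (bpf_prog_load("sockmap_kern.o", BPF_PROG_TYPE_SK_MSG,
			  &obj, &prog_fd))
		return -1;

	map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj,
							  "my_sock_map"));
	if (map_fd < 0 ||
	    bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0))
		return -1;

	/* Every sendmsg()/sendpage() on sock_fd now runs the verdict
	 * program.
	 */
	return bpf_map_update_elem(map_fd, &idx, &sock_fd, BPF_ANY);
  }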