From f95f0f95cfb7f180ed7571d4915432d5098df7ec Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:02 +0200 Subject: xdp: Add frame size to xdp_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XDP have evolved to support several frame sizes, but xdp_buff was not updated with this information. The frame size (frame_sz) member of xdp_buff is introduced to know the real size of the memory the frame is delivered in. When introducing this also make it clear that some tailroom is reserved/required when creating SKBs using build_skb(). It would also have been an option to introduce a pointer to data_hard_end (with reserved offset). The advantage with frame_sz is that (like rxq) drivers only need to setup/assign this value once per NAPI cycle. Due to XDP-generic (and some drivers) it's not possible to store frame_sz inside xdp_rxq_info, because it's varies per packet as it can be based/depend on packet length. V2: nitpick: deduct -> deduce Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945334261.97035.555255657490688547.stgit@firesoul --- include/net/xdp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index 3cc6d5d84aa4..a764af4ae0ea 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -6,6 +6,8 @@ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ +#include /* skb_shared_info */ + /** * DOC: XDP RX-queue information * @@ -70,8 +72,19 @@ struct xdp_buff { void *data_hard_start; unsigned long handle; struct xdp_rxq_info *rxq; + u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; +/* Reserve memory area at end-of data area. + * + * This macro reserves tailroom in the XDP buffer by limiting the + * XDP/BPF data access to data_hard_end. Notice same area (and size) + * is used for XDP_PASS, when constructing the SKB via build_skb(). + */ +#define xdp_data_hard_end(xdp) \ + ((xdp)->data_hard_start + (xdp)->frame_sz - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + struct xdp_frame { void *data; u16 len; -- cgit v1.2.3 From 63fe91ab3d1c5c0b4497b993b8eeaa54f6688d53 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:07 +0200 Subject: bnxt: Add XDP frame size to driver This driver uses full PAGE_SIZE pages when XDP is enabled. In case of XDP uses driver uses __bnxt_alloc_rx_page which does full page DMA-map. Thus, xdp_adjust_tail grow is DMA compliant for XDP_TX action that does DMA-sync. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Reviewed-by: Andy Gospodarek Cc: Michael Chan Cc: Andy Gospodarek Link: https://lore.kernel.org/bpf/158945334769.97035.13437970179897613984.stgit@firesoul --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index c6f6f2033880..5e3b4a3b69ea 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -138,6 +138,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, xdp_set_data_meta_invalid(&xdp); xdp.data_end = *data_ptr + *len; xdp.rxq = &rxr->xdp_rxq; + xdp.frame_sz = PAGE_SIZE; /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */ orig_data = xdp.data; rcu_read_lock(); -- cgit v1.2.3 From 983e43451830742fa93f83656ccbdcb865ea4259 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:12 +0200 Subject: sfc: Add XDP frame size This driver uses RX page-split when possible. It was recently fixed in commit 86e85bf6981c ("sfc: fix XDP-redirect in this driver") to add needed tailroom for XDP-redirect. After the fix efx->rx_page_buf_step is the frame size, with enough head and tail-room for XDP-redirect. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945335278.97035.14611425333184621652.stgit@firesoul --- drivers/net/ethernet/sfc/rx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 260352d97d9d..68c47a8c71df 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -308,6 +308,7 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + rx_buf->len; xdp.rxq = &rx_queue->xdp_rxq_info; + xdp.frame_sz = efx->rx_page_buf_step; xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); rcu_read_unlock(); -- cgit v1.2.3 From 494f44d54e25dd79af0ed6734c2d6be0aa0b6d94 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:17 +0200 Subject: mvneta: Add XDP frame size to driver This marvell driver mvneta uses PAGE_SIZE frames, which makes it really easy to convert. Driver updates rxq and now frame_sz once per NAPI call. This driver takes advantage of page_pool PP_FLAG_DMA_SYNC_DEV that can help reduce the number of cache-lines that need to be flushed when doing DMA sync for_device. Due to xdp_adjust_tail can grow the area accessible to the by the CPU (can possibly write into), then max sync length *after* bpf_prog_run_xdp() needs to be taken into account. For XDP_TX action the driver is smart and does DMA-sync. When growing tail this is still safe, because page_pool have DMA-mapped the entire page size. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Lorenzo Bianconi Cc: thomas.petazzoni@bootlin.com Link: https://lore.kernel.org/bpf/158945335786.97035.12714388304493736747.stgit@firesoul --- drivers/net/ethernet/marvell/mvneta.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index e0e9e56830c0..41d2a0eac5fa 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2148,12 +2148,17 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, struct bpf_prog *prog, struct xdp_buff *xdp, struct mvneta_stats *stats) { - unsigned int len; + unsigned int len, sync; + struct page *page; u32 ret, act; len = xdp->data_end - xdp->data_hard_start - pp->rx_offset_correction; act = bpf_prog_run_xdp(prog, xdp); + /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */ + sync = xdp->data_end - xdp->data_hard_start - pp->rx_offset_correction; + sync = max(sync, len); + switch (act) { case XDP_PASS: stats->xdp_pass++; @@ -2164,9 +2169,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, err = xdp_do_redirect(pp->dev, xdp, prog); if (unlikely(err)) { ret = MVNETA_XDP_DROPPED; - page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), len, - true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(rxq->page_pool, page, sync, true); } else { ret = MVNETA_XDP_REDIR; stats->xdp_redirect++; @@ -2175,10 +2179,10 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, } case XDP_TX: ret = mvneta_xdp_xmit_back(pp, xdp); - if (ret != MVNETA_XDP_TX) - page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), len, - true); + if (ret != MVNETA_XDP_TX) { + page = virt_to_head_page(xdp->data); + page_pool_put_page(rxq->page_pool, page, sync, true); + } break; default: bpf_warn_invalid_xdp_action(act); @@ -2187,8 +2191,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, trace_xdp_exception(pp->dev, prog, act); /* fall through */ case XDP_DROP: - page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), len, true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(rxq->page_pool, page, sync, true); ret = MVNETA_XDP_DROPPED; stats->xdp_drop++; break; @@ -2320,6 +2324,7 @@ static int mvneta_rx_swbm(struct napi_struct *napi, rcu_read_lock(); xdp_prog = READ_ONCE(pp->xdp_prog); xdp_buf.rxq = &rxq->xdp_rxq; + xdp_buf.frame_sz = PAGE_SIZE; /* Fairness NAPI loop */ while (rx_proc < budget && rx_proc < rx_todo) { -- cgit v1.2.3 From 495de55f70199bd7ea09079c484283e58bf75c82 Mon Sep 17 00:00:00 2001 From: Ilias Apalodimas Date: Thu, 14 May 2020 12:49:23 +0200 Subject: net: netsec: Add support for XDP frame size This driver takes advantage of page_pool PP_FLAG_DMA_SYNC_DEV that can help reduce the number of cache-lines that need to be flushed when doing DMA sync for_device. Due to xdp_adjust_tail can grow the area accessible to the by the CPU (can possibly write into), then max sync length *after* bpf_prog_run_xdp() needs to be taken into account. For XDP_TX action the driver is smart and does DMA-sync. When growing tail this is still safe, because page_pool have DMA-mapped the entire page size. Signed-off-by: Ilias Apalodimas Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Lorenzo Bianconi Link: https://lore.kernel.org/bpf/158945336295.97035.15034759661036971024.stgit@firesoul --- drivers/net/ethernet/socionext/netsec.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index a5a0fb60193a..e1f4be4b3d69 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -884,23 +884,28 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, struct xdp_buff *xdp) { struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX]; - unsigned int len = xdp->data_end - xdp->data; + unsigned int sync, len = xdp->data_end - xdp->data; u32 ret = NETSEC_XDP_PASS; + struct page *page; int err; u32 act; act = bpf_prog_run_xdp(prog, xdp); + /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */ + sync = xdp->data_end - xdp->data_hard_start - NETSEC_RXBUF_HEADROOM; + sync = max(sync, len); + switch (act) { case XDP_PASS: ret = NETSEC_XDP_PASS; break; case XDP_TX: ret = netsec_xdp_xmit_back(priv, xdp); - if (ret != NETSEC_XDP_TX) - page_pool_put_page(dring->page_pool, - virt_to_head_page(xdp->data), len, - true); + if (ret != NETSEC_XDP_TX) { + page = virt_to_head_page(xdp->data); + page_pool_put_page(dring->page_pool, page, sync, true); + } break; case XDP_REDIRECT: err = xdp_do_redirect(priv->ndev, xdp, prog); @@ -908,9 +913,8 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, ret = NETSEC_XDP_REDIR; } else { ret = NETSEC_XDP_CONSUMED; - page_pool_put_page(dring->page_pool, - virt_to_head_page(xdp->data), len, - true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(dring->page_pool, page, sync, true); } break; default: @@ -921,8 +925,8 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, /* fall through -- handle aborts by dropping packet */ case XDP_DROP: ret = NETSEC_XDP_CONSUMED; - page_pool_put_page(dring->page_pool, - virt_to_head_page(xdp->data), len, true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(dring->page_pool, page, sync, true); break; } @@ -936,10 +940,14 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) struct netsec_rx_pkt_info rx_info; enum dma_data_direction dma_dir; struct bpf_prog *xdp_prog; + struct xdp_buff xdp; u16 xdp_xmit = 0; u32 xdp_act = 0; int done = 0; + xdp.rxq = &dring->xdp_rxq; + xdp.frame_sz = PAGE_SIZE; + rcu_read_lock(); xdp_prog = READ_ONCE(priv->xdp_prog); dma_dir = page_pool_get_dma_dir(dring->page_pool); @@ -953,7 +961,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) struct sk_buff *skb = NULL; u16 pkt_len, desc_len; dma_addr_t dma_handle; - struct xdp_buff xdp; void *buf_addr; if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD)) { @@ -1002,7 +1009,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) xdp.data = desc->addr + NETSEC_RXBUF_HEADROOM; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + pkt_len; - xdp.rxq = &dring->xdp_rxq; if (xdp_prog) { xdp_result = netsec_run_xdp(priv, xdp_prog, &xdp); -- cgit v1.2.3 From a075767bbdc659066b89be282c8377fa880e9dc4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:28 +0200 Subject: net: XDP-generic determining XDP frame size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SKB "head" pointer points to the data area that contains skb_shared_info, that can be found via skb_end_pointer(). Given xdp->data_hard_start have been established (basically pointing to skb->head), frame size is between skb_end_pointer() and data_hard_start, plus the size reserved to skb_shared_info. Change the bpf_xdp_adjust_tail offset adjust of skb->len, to be a positive offset number on grow, and negative number on shrink. As this seems more natural when reading the code. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945336804.97035.7164852191163722056.stgit@firesoul --- net/core/dev.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 4c91de39890a..f937a3ff668d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4617,6 +4617,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_meta = xdp->data; xdp->data_end = xdp->data + hlen; xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; @@ -4640,14 +4645,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb_reset_network_header(skb); } - /* check if bpf_xdp_adjust_tail was used. it can only "shrink" - * pckt. - */ - off = orig_data_end - xdp->data_end; + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); - skb->len -= off; - + skb->len += off; /* positive on grow, negative on shrink */ } /* check if XDP changed eth hdr such SKB needs update */ -- cgit v1.2.3 From 34cc0b338a61de3eee3a2bfcaf4f9d6e9fae091a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:33 +0200 Subject: xdp: Xdp_frame add member frame_sz and handle in convert_to_xdp_frame MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use hole in struct xdp_frame, when adding member frame_sz, which keeps same sizeof struct (32 bytes) Drivers ixgbe and sfc had bug cases where the necessary/expected tailroom was not reserved. This can lead to some hard to catch memory corruption issues. Having the drivers frame_sz this can be detected when packet length/end via xdp->data_end exceed the xdp_data_hard_end pointer, which accounts for the reserved the tailroom. When detecting this driver issue, simply fail the conversion with NULL, which results in feedback to driver (failing xdp_do_redirect()) causing driver to drop packet. Given the lack of consistent XDP stats, this can be hard to troubleshoot. And given this is a driver bug, we want to generate some more noise in form of a WARN stack dump (to ID the driver code that inlined convert_to_xdp_frame). Inlining the WARN macro is problematic, because it adds an asm instruction (on Intel CPUs ud2) what influence instruction cache prefetching. Thus, introduce xdp_warn and macro XDP_WARN, to avoid this and at the same time make identifying the function and line of this inlined function easier. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945337313.97035.10015729316710496600.stgit@firesoul --- include/net/xdp.h | 14 +++++++++++++- net/core/xdp.c | 8 ++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index a764af4ae0ea..3094fccf5a88 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -89,7 +89,8 @@ struct xdp_frame { void *data; u16 len; u16 headroom; - u16 metasize; + u32 metasize:8; + u32 frame_sz:24; /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. */ @@ -104,6 +105,10 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +/* Avoids inlining WARN macro in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line); +#define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) + struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); /* Convert xdp_buff to xdp_frame */ @@ -124,6 +129,12 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return NULL; + /* Catch if driver didn't reserve tailroom for skb_shared_info */ + if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { + XDP_WARN("Driver BUG: missing reserved tailroom"); + return NULL; + } + /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; @@ -131,6 +142,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; + xdp_frame->frame_sz = xdp->frame_sz; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; diff --git a/net/core/xdp.c b/net/core/xdp.c index 4c7ea85486af..490b8f5fa8ee 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -496,3 +497,10 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); -- cgit v1.2.3 From db612f749e2454c506f20155bba2871f0307d133 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:38 +0200 Subject: xdp: Cpumap redirect use frame_sz and increase skb_tailroom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Knowing the memory size backing the packet/xdp_frame data area, and knowing it already have reserved room for skb_shared_info, simplifies using build_skb significantly. With this change we no-longer lie about the SKB truesize, but more importantly a significant larger skb_tailroom is now provided, e.g. when drivers uses a full PAGE_SIZE. This extra tailroom (in linear area) can be used by the network stack when coalescing SKBs (e.g. in skb_try_coalesce, see TCP cases where tcp_queue_rcv() can 'eat' skb). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945337822.97035.13557959180460986059.stgit@firesoul --- kernel/bpf/cpumap.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3fe0b006d2d2..a71790dab12d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -162,25 +162,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - /* build_skb need to place skb_shared_info after SKB end, and - * also want to know the memory "truesize". Thus, need to - * know the memory frame size backing xdp_buff. - * - * XDP was designed to have PAGE_SIZE frames, but this - * assumption is not longer true with ixgbe and i40e. It - * would be preferred to set frame_size to 2048 or 4096 - * depending on the driver. - * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_frame); - * - * Instead, with info avail, skb_shared_info in placed after - * packet len. This, unfortunately fakes the truesize. - * Another disadvantage of this approach, the skb_shared_info - * is not at a fixed memory location, with mixed length - * packets, which is bad for cache-line hotness. + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_size = xdpf->frame_sz; pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb_around(skb, pkt_data_start, frame_size); -- cgit v1.2.3 From 5c8572251fabc5bb49fd623c064e95a9daf6a3e3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:43 +0200 Subject: veth: Adjust hard_start offset on redirect XDP frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When native XDP redirect into a veth device, the frame arrives in the xdp_frame structure. It is then processed in veth_xdp_rcv_one(), which can run a new XDP bpf_prog on the packet. Doing so requires converting xdp_frame to xdp_buff, but the tricky part is that xdp_frame memory area is located in the top (data_hard_start) memory area that xdp_buff will point into. The current code tried to protect the xdp_frame area, by assigning xdp_buff.data_hard_start past this memory. This results in 32 bytes less headroom to expand into via BPF-helper bpf_xdp_adjust_head(). This protect step is actually not needed, because BPF-helper bpf_xdp_adjust_head() already reserve this area, and don't allow BPF-prog to expand into it. Thus, it is safe to point data_hard_start directly at xdp_frame memory area. Fixes: 9fc8d518d9d5 ("veth: Handle xdp_frames in xdp napi ring") Reported-by: Mao Wenan Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toshiaki Makita Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945338331.97035.5923525383710752178.stgit@firesoul --- drivers/net/veth.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index aece0e5eec8c..d5691bb84448 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -564,13 +564,15 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, struct veth_stats *stats) { void *hard_start = frame->data - frame->headroom; - void *head = hard_start - sizeof(struct xdp_frame); int len = frame->len, delta = 0; struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; unsigned int headroom; struct sk_buff *skb; + /* bpf_xdp_adjust_head() assures BPF cannot access xdp_frame area */ + hard_start -= sizeof(struct xdp_frame); + rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { @@ -592,7 +594,6 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, break; case XDP_TX: orig_frame = *frame; - xdp.data_hard_start = head; xdp.rxq->mem = frame->mem; if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); @@ -605,7 +606,6 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; - xdp.data_hard_start = head; xdp.rxq->mem = frame->mem; if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { frame = &orig_frame; @@ -629,7 +629,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, rcu_read_unlock(); headroom = sizeof(struct xdp_frame) + frame->headroom - delta; - skb = veth_build_skb(head, headroom, len, 0); + skb = veth_build_skb(hard_start, headroom, len, 0); if (!skb) { xdp_return_frame(frame); stats->rx_drops++; -- cgit v1.2.3 From 45a9e6d8a687e6a0ea6c2f78f15955ae96be4720 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:48 +0200 Subject: veth: Xdp using frame_sz in veth driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The veth driver can run XDP in "native" mode in it's own NAPI handler, and since commit 9fc8d518d9d5 ("veth: Handle xdp_frames in xdp napi ring") packets can come in two forms either xdp_frame or skb, calling respectively veth_xdp_rcv_one() or veth_xdp_rcv_skb(). For packets to arrive in xdp_frame format, they will have been redirected from an XDP native driver. In case of XDP_PASS or no XDP-prog attached, the veth driver will allocate and create an SKB. The current code in veth_xdp_rcv_one() xdp_frame case, had to guess the frame truesize of the incoming xdp_frame, when using veth_build_skb(). With xdp_frame->frame_sz this is not longer necessary. Calculating the frame_sz in veth_xdp_rcv_skb() skb case, is done similar to the XDP-generic handling code in net/core/dev.c. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenzo Bianconi Acked-by: Toke Høiland-Jørgensen Acked-by: Toshiaki Makita Link: https://lore.kernel.org/bpf/158945338840.97035.935897116345700902.stgit@firesoul --- drivers/net/veth.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index d5691bb84448..b586d2fa5551 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -405,10 +405,6 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len, { struct sk_buff *skb; - if (!buflen) { - buflen = SKB_DATA_ALIGN(headroom + len) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - } skb = build_skb(head, buflen); if (!skb) return NULL; @@ -583,6 +579,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, xdp.data = frame->data; xdp.data_end = frame->data + frame->len; xdp.data_meta = frame->data - frame->metasize; + xdp.frame_sz = frame->frame_sz; xdp.rxq = &rq->xdp_rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -629,7 +626,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, rcu_read_unlock(); headroom = sizeof(struct xdp_frame) + frame->headroom - delta; - skb = veth_build_skb(hard_start, headroom, len, 0); + skb = veth_build_skb(hard_start, headroom, len, frame->frame_sz); if (!skb) { xdp_return_frame(frame); stats->rx_drops++; @@ -695,9 +692,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, goto drop; } - nskb = veth_build_skb(head, - VETH_XDP_HEADROOM + mac_len, skb->len, - PAGE_SIZE); + nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, + skb->len, PAGE_SIZE); if (!nskb) { page_frag_free(head); goto drop; @@ -715,6 +711,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, xdp.data_end = xdp.data + pktlen; xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp.frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; + xdp.frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data = xdp.data; orig_data_end = xdp.data_end; @@ -758,6 +759,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, } rcu_read_unlock(); + /* check if bpf_xdp_adjust_head was used */ delta = orig_data - xdp.data; off = mac_len + delta; if (off > 0) @@ -765,9 +767,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, else if (off < 0) __skb_pull(skb, -off); skb->mac_header -= delta; + + /* check if bpf_xdp_adjust_tail was used */ off = xdp.data_end - orig_data_end; if (off != 0) - __skb_put(skb, off); + __skb_put(skb, off); /* positive on grow, negative on shrink */ skb->protocol = eth_type_trans(skb, rq->dev); metalen = xdp.data - xdp.data_meta; -- cgit v1.2.3 From 4a9b052a590d6217237502efde7d598156966080 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:53 +0200 Subject: dpaa2-eth: Add XDP frame size The dpaa2-eth driver reserve some headroom used for hardware and software annotation area in RX/TX buffers. Thus, xdp.data_hard_start doesn't start at page boundary. When XDP is configured the area reserved via dpaa2_fd_get_offset(fd) is 448 bytes of which XDP have reserved 256 bytes. As frame_sz is calculated as an offset from xdp_buff.data_hard_start, an adjust from the full PAGE_SIZE == DPAA2_ETH_RX_BUF_RAW_SIZE. When doing XDP_REDIRECT, the driver doesn't need this reserved headroom any-longer and allows xdp_do_redirect() to use it. This is an advantage for the drivers own ndo-xdp_xmit, as it uses part of this headroom for itself. Patch also adjust frame_sz in this case. The driver cannot support XDP data_meta, because it uses the headroom just before xdp.data for struct dpaa2_eth_swa (DPAA2_ETH_SWA_SIZE=64), when transmitting the packet. When transmitting a xdp_frame in dpaa2_eth_xdp_xmit_frame (call via ndo_xdp_xmit) is uses this area to store a pointer to xdp_frame and dma_size, which is used in TX completion (free_tx_fd) to return frame via xdp_return_frame(). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Cc: Ioana Radulescu Link: https://lore.kernel.org/bpf/158945339348.97035.8562488847066908856.stgit@firesoul --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 0f3e842a4fd6..8c8d95aa1dfd 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -331,6 +331,9 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv, xdp_set_data_meta_invalid(&xdp); xdp.rxq = &ch->xdp_rxq; + xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE - + (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM); + xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); /* xdp.data pointer may have changed */ @@ -366,7 +369,11 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv, dma_unmap_page(priv->net_dev->dev.parent, addr, DPAA2_ETH_RX_BUF_SIZE, DMA_BIDIRECTIONAL); ch->buf_count--; + + /* Allow redirect use of full headroom */ xdp.data_hard_start = vaddr; + xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE; + err = xdp_do_redirect(priv->net_dev, &xdp, xdp_prog); if (unlikely(err)) ch->stats.xdp_drop++; -- cgit v1.2.3 From 7358877ac11041a22ce1cb35c352809051eac48f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:58 +0200 Subject: hv_netvsc: Add XDP frame size to driver The hyperv NIC driver does memory allocation and copy even without XDP. In XDP mode it will allocate a new page for each packet and copy over the payload, before invoking the XDP BPF-prog. The positive thing it that its easy to determine the xdp.frame_sz. The XDP implementation for hv_netvsc transparently passes xdp_prog to the associated VF NIC. Many of the Azure VMs are using SRIOV, so majority of the data are actually processed directly on the VF driver's XDP path. So the overhead of the synthetic data path (hv_netvsc) is minimal. Then XDP is enabled on this driver, XDP_PASS and XDP_TX will create the SKB via build_skb (based on the newly allocated page). Now using XDP frame_sz this will provide more skb_tailroom, which netstack can use for SKB coalescing (e.g tcp_try_coalesce -> skb_try_coalesce). V3: Adjust patch desc to be more positive. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Cc: Wei Liu Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Link: https://lore.kernel.org/bpf/158945339857.97035.10212138582505736163.stgit@firesoul --- drivers/net/hyperv/netvsc_bpf.c | 1 + drivers/net/hyperv/netvsc_drv.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index b86611041db6..1e0c024b0a93 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -49,6 +49,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; xdp->rxq = &nvchan->xdp_rxq; + xdp->frame_sz = PAGE_SIZE; xdp->handle = 0; memcpy(xdp->data, data, len); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 5de57fc3ec60..6267f706e8ee 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -795,7 +795,7 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, if (xbuf) { unsigned int hdroom = xdp->data - xdp->data_hard_start; unsigned int xlen = xdp->data_end - xdp->data; - unsigned int frag_size = netvsc_xdp_fraglen(hdroom + xlen); + unsigned int frag_size = xdp->frame_sz; skb = build_skb(xbuf, frag_size); -- cgit v1.2.3 From bc1c5745d77963a4f4684c78cc2b3323900af68b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:03 +0200 Subject: qlogic/qede: Add XDP frame size to driver The driver qede uses a full page, when XDP is enabled. The drivers value in rx_buf_seg_size (struct qede_rx_queue) will be PAGE_SIZE when an XDP bpf_prog is attached. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Cc: Ariel Elior Cc: GR-everest-linux-l2@marvell.com Link: https://lore.kernel.org/bpf/158945340366.97035.7764939691580349618.stgit@firesoul --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 1 + drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index c6c20776b474..7598ebe0962a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1066,6 +1066,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; xdp.rxq = &rxq->xdp_rxq; + xdp.frame_sz = rxq->rx_buf_seg_size; /* PAGE_SIZE when XDP enabled */ /* Queues always have a full reset currently, so for the time * being until there's atomic program replace just mark read diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index f50d9a9b76be..b2d154258b07 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1476,7 +1476,7 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq) if (rxq->rx_buf_size + size > PAGE_SIZE) rxq->rx_buf_size = PAGE_SIZE - size; - /* Segment size to spilt a page in multiple equal parts , + /* Segment size to split a page in multiple equal parts, * unless XDP is used in which case we'd use the entire page. */ if (!edev->xdp_prog) { -- cgit v1.2.3 From c88c35181d6ab83e439855681032653ef8728045 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:08 +0200 Subject: net: ethernet: ti: Add XDP frame size to driver cpsw The driver code cpsw.c and cpsw_new.c both use page_pool with default order-0 pages or their RX-pages. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Reviewed-by: Grygorii Strashko Cc: Ilias Apalodimas Link: https://lore.kernel.org/bpf/158945340875.97035.752144756428532878.stgit@firesoul --- drivers/net/ethernet/ti/cpsw.c | 1 + drivers/net/ethernet/ti/cpsw_new.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 09f98fa2fb4e..ce0645ada6e7 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -406,6 +406,7 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp.data_hard_start = pa; xdp.rxq = &priv->xdp_rxq[ch]; + xdp.frame_sz = PAGE_SIZE; port = priv->emac_port + cpsw->data.dual_emac; ret = cpsw_run_xdp(priv, ch, &xdp, page, port); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index dce49311d3d3..1247d35d42ef 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -348,6 +348,7 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp.data_hard_start = pa; xdp.rxq = &priv->xdp_rxq[ch]; + xdp.frame_sz = PAGE_SIZE; ret = cpsw_run_xdp(priv, ch, &xdp, page, priv->emac_port); if (ret != CPSW_XDP_PASS) -- cgit v1.2.3 From 08fc1cfd2d250be853d33d6505ae11ff52b83b74 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:13 +0200 Subject: ena: Add XDP frame size to amazon NIC driver Frame size ENA_PAGE_SIZE is limited to 16K on systems with larger PAGE_SIZE than 16K. Change ENA_XDP_MAX_MTU to also take into account the reserved tailroom. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Sameeh Jubran Cc: Arthur Kiyanovski Link: https://lore.kernel.org/bpf/158945341384.97035.907403694833419456.stgit@firesoul --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 1 + drivers/net/ethernet/amazon/ena/ena_netdev.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 2818965427e9..85b87ed02dd5 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1606,6 +1606,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, "%s qid %d\n", __func__, rx_ring->qid); res_budget = budget; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = ENA_PAGE_SIZE; do { xdp_verdict = XDP_PASS; diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index 7df67bf09b93..680099afcccf 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ -151,8 +151,9 @@ * The buffer size we share with the device is defined to be ENA_PAGE_SIZE */ -#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ - VLAN_HLEN - XDP_PACKET_HEADROOM) +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) -- cgit v1.2.3 From d201ea9ebc519fb34ad9ef1f49ab2ab31f5111ea Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:18 +0200 Subject: mlx4: Add XDP frame size and adjust max XDP MTU The mlx4 drivers size of memory backing the RX packet is stored in frag_stride. For XDP mode this will be PAGE_SIZE (normally 4096). For normal mode frag_stride is 2048. Also adjust MLX4_EN_MAX_XDP_MTU to take tailroom into account. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Reviewed-by: Tariq Toukan Cc: Saeed Mahameed Link: https://lore.kernel.org/bpf/158945341893.97035.2688142527052329942.stgit@firesoul --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 3 ++- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 43dcbd8214c6..5bd3cd37d50f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -51,7 +51,8 @@ #include "en_port.h" #define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \ - XDP_PACKET_HEADROOM)) + XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))) int mlx4_en_setup_tc(struct net_device *dev, u8 up) { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 787139219813..8a10285b0e10 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -683,6 +683,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud rcu_read_lock(); xdp_prog = rcu_dereference(ring->xdp_prog); xdp.rxq = &ring->xdp_rxq; + xdp.frame_sz = priv->frag_info[0].frag_stride; doorbell_pending = 0; /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx -- cgit v1.2.3 From c8145b263dd85f9e589c7c7ba531423d82ca96ae Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:24 +0200 Subject: net: thunderx: Add XDP frame size To help reviewers these are the defines related to RCV_FRAG_LEN #define DMA_BUFFER_LEN 1536 /* In multiples of 128bytes */ #define RCV_FRAG_LEN (SKB_DATA_ALIGN(DMA_BUFFER_LEN + NET_SKB_PAD) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Cc: Sunil Goutham Cc: Robert Richter Link: https://lore.kernel.org/bpf/158945342402.97035.12649844447148990032.stgit@firesoul --- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index b4b33368698f..2ba0ce115e63 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -552,6 +552,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = RCV_FRAG_LEN + XDP_PACKET_HEADROOM; orig_data = xdp.data; rcu_read_lock(); -- cgit v1.2.3 From fa6540b8efd8944f8627c2f304114663ef4aadc4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:29 +0200 Subject: nfp: Add XDP frame size to netronome driver The netronome nfp driver use PAGE_SIZE when xdp_prog is set, but xdp.data_hard_start begins at offset NFP_NET_RX_BUF_HEADROOM. Thus, adjust for this when setting xdp.frame_sz, as it counts from data_hard_start. When doing XDP_TX this driver is smart and instead of a full DMA-map does a DMA-sync on with packet length. As xdp_adjust_tail can now grow packet length, add checks to make sure that grow size is within the DMA-mapped size. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/158945342911.97035.11214251236208648808.stgit@firesoul --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 9bfb3b077bc1..0e0cc3d58bdc 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1741,10 +1741,15 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring, struct nfp_net_rx_buf *rxbuf, unsigned int dma_off, unsigned int pkt_len, bool *completed) { + unsigned int dma_map_sz = dp->fl_bufsz - NFP_NET_RX_BUF_NON_DATA; struct nfp_net_tx_buf *txbuf; struct nfp_net_tx_desc *txd; int wr_idx; + /* Reject if xdp_adjust_tail grow packet beyond DMA area */ + if (pkt_len + dma_off > dma_map_sz) + return false; + if (unlikely(nfp_net_tx_full(tx_ring, 1))) { if (!*completed) { nfp_net_xdp_complete(tx_ring); @@ -1817,6 +1822,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) rcu_read_lock(); xdp_prog = READ_ONCE(dp->xdp_prog); true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz; + xdp.frame_sz = PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM; xdp.rxq = &rx_ring->xdp_rxq; tx_ring = r_vec->xdp_ring; -- cgit v1.2.3 From fb3e6e9307973d2f70a173f1b33d1054fa2b691f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:34 +0200 Subject: tun: Add XDP frame size The tun driver have two code paths for running XDP (bpf_prog_run_xdp). In both cases 'buflen' contains enough tailroom for skb_shared_info. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://lore.kernel.org/bpf/158945343419.97035.9594485183958037621.stgit@firesoul --- drivers/net/tun.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 44889eba1dbc..c54f967e2c66 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1671,6 +1671,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; xdp.rxq = &tfile->xdp_rxq; + xdp.frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { @@ -2411,6 +2412,7 @@ static int tun_xdp_one(struct tun_struct *tun, } xdp_set_data_meta_invalid(xdp); xdp->rxq = &tfile->xdp_rxq; + xdp->frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, xdp); err = tun_xdp_act(tun, xdp_prog, xdp, act); -- cgit v1.2.3 From 05afee298afc2f2497b7400b53e9d60fcc24d525 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:39 +0200 Subject: vhost_net: Also populate XDP frame size In vhost_net_build_xdp() the 'buf' that gets queued via an xdp_buff have embedded a struct tun_xdp_hdr (located at xdp->data_hard_start) which contains the buffer length 'buflen' (with tailroom for skb_shared_info). Also storing this buflen in xdp->frame_sz, does not obsolete struct tun_xdp_hdr, as it also contains a struct virtio_net_hdr with other information. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://lore.kernel.org/bpf/158945343928.97035.4620233649151726289.stgit@firesoul --- drivers/vhost/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 2927f02cc7e1..516519dcc8ff 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -747,6 +747,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, xdp->data = buf + pad; xdp->data_end = xdp->data + len; hdr->buflen = buflen; + xdp->frame_sz = buflen; --net->refcnt_bias; alloc_frag->offset += buflen; -- cgit v1.2.3 From 9ce6146ec7b50718fa5ef5287f1d6561b25a5da8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:44 +0200 Subject: virtio_net: Add XDP frame size in two code paths The virtio_net driver is running inside the guest-OS. There are two XDP receive code-paths in virtio_net, namely receive_small() and receive_mergeable(). The receive_big() function does not support XDP. In receive_small() the frame size is available in buflen. The buffer backing these frames are allocated in add_recvbuf_small() with same size, except for the headroom, but tailroom have reserved room for skb_shared_info. The headroom is encoded in ctx pointer as a value. In receive_mergeable() the frame size is more dynamic. There are two basic cases: (1) buffer size is based on a exponentially weighted moving average (see DECLARE_EWMA) of packet length. Or (2) in case virtnet_get_headroom() have any headroom then buffer size is PAGE_SIZE. The ctx pointer is this time used for encoding two values; the buffer len "truesize" and headroom. In case (1) if the rx buffer size is underestimated, the packet will have been split over more buffers (num_buf info in virtio_net_hdr_mrg_rxbuf placed in top of buffer area). If that happens the XDP path does a xdp_linearize_page operation. V3: Adjust frame_sz in receive_mergeable() case, spotted by Jason Wang. The code is really hard to follow, so some hints to reviewers. The receive_mergeable() case gets frames that were allocated in add_recvbuf_mergeable() which uses headroom=virtnet_get_headroom(), and 'buf' ptr is advanced this headroom. The headroom can only be 0 or VIRTIO_XDP_HEADROOM, as virtnet_get_headroom is really simple: static unsigned int virtnet_get_headroom(struct virtnet_info *vi) { return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0; } As frame_sz is an offset size from xdp.data_hard_start, reviewers should notice how this is calculated in receive_mergeable(): int offset = buf - page_address(page); [...] data = page_address(xdp_page) + offset; xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; The calculated offset will always be VIRTIO_XDP_HEADROOM when reaching this code. Thus, xdp.data_hard_start will be page-start address plus vi->hdr_len. Given this xdp.frame_sz need to be reduced with vi->hdr_len size. IMHO a followup patch should cleanup this code to make it easier to maintain and understand, but it is outside the scope of this patchset. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://lore.kernel.org/bpf/158945344436.97035.9445115070189151680.stgit@firesoul --- drivers/net/virtio_net.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 11f722460513..9e1b5d748586 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -689,6 +689,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_end = xdp.data + len; xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = buflen; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -797,10 +798,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, int offset = buf - page_address(page); struct sk_buff *head_skb, *curr_skb; struct bpf_prog *xdp_prog; - unsigned int truesize; + unsigned int truesize = mergeable_ctx_to_truesize(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx); - int err; unsigned int metasize = 0; + unsigned int frame_sz; + int err; head_skb = NULL; stats->bytes += len - vi->hdr_len; @@ -821,6 +823,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, if (unlikely(hdr->hdr.gso_type)) goto err_xdp; + /* Buffers with headroom use PAGE_SIZE as alloc size, + * see add_recvbuf_mergeable() + get_mergeable_buf_len() + */ + frame_sz = headroom ? PAGE_SIZE : truesize; + /* This happens when rx buffer size is underestimated * or headroom is not enough because of the buffer * was refilled before XDP is set. This should only @@ -834,6 +841,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, page, offset, VIRTIO_XDP_HEADROOM, &len); + frame_sz = PAGE_SIZE; + if (!xdp_page) goto err_xdp; offset = VIRTIO_XDP_HEADROOM; @@ -850,6 +859,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, xdp.data_end = xdp.data + (len - vi->hdr_len); xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = frame_sz - vi->hdr_len; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -924,7 +934,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } rcu_read_unlock(); - truesize = mergeable_ctx_to_truesize(ctx); if (unlikely(len > truesize)) { pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)ctx); -- cgit v1.2.3 From 88eb0ee17b2ece64fcf6689a4557a5c2e7a89c4b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:49 +0200 Subject: ixgbe: Fix XDP redirect on archs with PAGE_SIZE above 4K The ixgbe driver have another memory model when compiled on archs with PAGE_SIZE above 4096 bytes. In this mode it doesn't split the page in two halves, but instead increment rx_buffer->page_offset by truesize of packet (which include headroom and tailroom for skb_shared_info). This is done correctly in ixgbe_build_skb(), but in ixgbe_rx_buffer_flip which is currently only called on XDP_TX and XDP_REDIRECT, it forgets to add the tailroom for skb_shared_info. This breaks XDP_REDIRECT, for veth and cpumap. Fix by adding size of skb_shared_info tailroom. Maintainers notice: This fix have been queued to Jeff. Fixes: 6453073987ba ("ixgbe: add initial support for xdp redirect") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Cc: Jeff Kirsher Link: https://lore.kernel.org/bpf/158945344946.97035.17031588499266605743.stgit@firesoul --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 718931d951bc..ea6834bae04c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2254,7 +2254,8 @@ static void ixgbe_rx_buffer_flip(struct ixgbe_ring *rx_ring, rx_buffer->page_offset ^= truesize; #else unsigned int truesize = ring_uses_build_skb(rx_ring) ? - SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) : + SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : SKB_DATA_ALIGN(size); rx_buffer->page_offset += truesize; -- cgit v1.2.3 From cf02512899805d6f3d48c0cf1825148f5d24fe71 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:54 +0200 Subject: ixgbe: Add XDP frame size to driver This driver uses different memory models depending on PAGE_SIZE at compile time. For PAGE_SIZE 4K it uses page splitting, meaning for normal MTU frame size is 2048 bytes (and headroom 192 bytes). For larger MTUs the driver still use page splitting, by allocating order-1 pages (8192 bytes) for RX frames. For PAGE_SIZE larger than 4K, driver instead advance its rx_buffer->page_offset with the frame size "truesize". For XDP frame size calculations, this mean that in PAGE_SIZE larger than 4K mode the frame_sz change on a per packet basis. For the page split 4K PAGE_SIZE mode, xdp.frame_sz is more constant and can be updated once outside the main NAPI loop. The default setting in the driver uses build_skb(), which provides the necessary headroom and tailroom for XDP-redirect in RX-frame (in both modes). There is one complication, which is legacy-rx mode (configurable via ethtool priv-flags). There are zero headroom in this mode, which is a requirement for XDP-redirect to work. The conversion to xdp_frame (convert_to_xdp_frame) will detect this insufficient space, and xdp_do_redirect() call will fail. This is deemed acceptable, as it allows other XDP actions to still work in legacy-mode. In legacy-mode + larger PAGE_SIZE due to lacking tailroom, we also accept that xdp_adjust_tail shrink doesn't work. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Cc: intel-wired-lan@lists.osuosl.org Cc: Jeff Kirsher Cc: Alexander Duyck Link: https://lore.kernel.org/bpf/158945345455.97035.14334355929030628741.stgit@firesoul --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 34 ++++++++++++++++++++------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index ea6834bae04c..eab5934b04f5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2244,20 +2244,30 @@ xdp_out: return ERR_PTR(-result); } +static unsigned int ixgbe_rx_frame_truesize(struct ixgbe_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ixgbe_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ring_uses_build_skb(rx_ring) ? + SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + static void ixgbe_rx_buffer_flip(struct ixgbe_ring *rx_ring, struct ixgbe_rx_buffer *rx_buffer, unsigned int size) { + unsigned int truesize = ixgbe_rx_frame_truesize(rx_ring, size); #if (PAGE_SIZE < 8192) - unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2; - rx_buffer->page_offset ^= truesize; #else - unsigned int truesize = ring_uses_build_skb(rx_ring) ? - SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : - SKB_DATA_ALIGN(size); - rx_buffer->page_offset += truesize; #endif } @@ -2291,6 +2301,11 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.rxq = &rx_ring->xdp_rxq; + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0); +#endif + while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; struct ixgbe_rx_buffer *rx_buffer; @@ -2324,7 +2339,10 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, size); +#endif skb = ixgbe_run_xdp(adapter, rx_ring, &xdp); } -- cgit v1.2.3 From 81f3c6283cff03efae139a85851602a4c1c6bd72 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:50:59 +0200 Subject: ixgbevf: Add XDP frame size to VF driver This patch mirrors the changes to ixgbe in previous patch. This VF driver doesn't support XDP_REDIRECT, but correct tailroom is still necessary for BPF-helper xdp_adjust_tail. In legacy-mode + larger PAGE_SIZE, due to lacking tailroom, we accept that xdp_adjust_tail shrink doesn't work. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Cc: intel-wired-lan@lists.osuosl.org Cc: Jeff Kirsher Cc: Alexander Duyck Link: https://lore.kernel.org/bpf/158945345984.97035.13518286183248025173.stgit@firesoul --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 34 ++++++++++++++++++----- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 4622c4ea2e46..a39e2cb384dd 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1095,19 +1095,31 @@ xdp_out: return ERR_PTR(-result); } +static unsigned int ixgbevf_rx_frame_truesize(struct ixgbevf_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ixgbevf_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ring_uses_build_skb(rx_ring) ? + SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + static void ixgbevf_rx_buffer_flip(struct ixgbevf_ring *rx_ring, struct ixgbevf_rx_buffer *rx_buffer, unsigned int size) { -#if (PAGE_SIZE < 8192) - unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2; + unsigned int truesize = ixgbevf_rx_frame_truesize(rx_ring, size); +#if (PAGE_SIZE < 8192) rx_buffer->page_offset ^= truesize; #else - unsigned int truesize = ring_uses_build_skb(rx_ring) ? - SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) : - SKB_DATA_ALIGN(size); - rx_buffer->page_offset += truesize; #endif } @@ -1125,6 +1137,11 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, xdp.rxq = &rx_ring->xdp_rxq; + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, 0); +#endif + while (likely(total_rx_packets < budget)) { struct ixgbevf_rx_buffer *rx_buffer; union ixgbe_adv_rx_desc *rx_desc; @@ -1157,7 +1174,10 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbevf_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, size); +#endif skb = ixgbevf_run_xdp(adapter, rx_ring, &xdp); } -- cgit v1.2.3 From 24104024ce0553ae7738bb1ea5e6e3ed6619160d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:04 +0200 Subject: i40e: Add XDP frame size to driver This driver uses different memory models depending on PAGE_SIZE at compile time. For PAGE_SIZE 4K it uses page splitting, meaning for normal MTU frame size is 2048 bytes (and headroom 192 bytes). For larger MTUs the driver still use page splitting, by allocating order-1 pages (8192 bytes) for RX frames. For PAGE_SIZE larger than 4K, driver instead advance its rx_buffer->page_offset with the frame size "truesize". For XDP frame size calculations, this mean that in PAGE_SIZE larger than 4K mode the frame_sz change on a per packet basis. For the page split 4K PAGE_SIZE mode, xdp.frame_sz is more constant and can be updated once outside the main NAPI loop. The default setting in the driver uses build_skb(), which provides the necessary headroom and tailroom for XDP-redirect in RX-frame (in both modes). There is one complication, which is legacy-rx mode (configurable via ethtool priv-flags). There are zero headroom in this mode, which is a requirement for XDP-redirect to work. The conversion to xdp_frame (convert_to_xdp_frame) will detect this insufficient space, and xdp_do_redirect() call will fail. This is deemed acceptable, as it allows other XDP actions to still work in legacy-mode. In legacy-mode + larger PAGE_SIZE due to lacking tailroom, we also accept that xdp_adjust_tail shrink doesn't work. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Cc: intel-wired-lan@lists.osuosl.org Cc: Jeff Kirsher Cc: Alexander Duyck Link: https://lore.kernel.org/bpf/158945346494.97035.12809400414566061815.stgit@firesoul --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 30 ++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index b8496037ef7f..a3772beffe02 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1507,6 +1507,22 @@ static inline unsigned int i40e_rx_offset(struct i40e_ring *rx_ring) return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0; } +static unsigned int i40e_rx_frame_truesize(struct i40e_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = i40e_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = i40e_rx_offset(rx_ring) ? + SKB_DATA_ALIGN(size + i40e_rx_offset(rx_ring)) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + /** * i40e_alloc_mapped_page - recycle or make a new page * @rx_ring: ring to use @@ -2246,13 +2262,11 @@ static void i40e_rx_buffer_flip(struct i40e_ring *rx_ring, struct i40e_rx_buffer *rx_buffer, unsigned int size) { -#if (PAGE_SIZE < 8192) - unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2; + unsigned int truesize = i40e_rx_frame_truesize(rx_ring, size); +#if (PAGE_SIZE < 8192) rx_buffer->page_offset ^= truesize; #else - unsigned int truesize = SKB_DATA_ALIGN(i40e_rx_offset(rx_ring) + size); - rx_buffer->page_offset += truesize; #endif } @@ -2335,6 +2349,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) bool failure = false; struct xdp_buff xdp; +#if (PAGE_SIZE < 8192) + xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); +#endif xdp.rxq = &rx_ring->xdp_rxq; while (likely(total_rx_packets < (unsigned int)budget)) { @@ -2389,7 +2406,10 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) xdp.data_hard_start = xdp.data - i40e_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, size); +#endif skb = i40e_run_xdp(rx_ring, &xdp); } -- cgit v1.2.3 From d4ecdbf7aa2fa4feac09befb04cdaf44e6dc938b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:10 +0200 Subject: ice: Add XDP frame size to driver This driver uses different memory models depending on PAGE_SIZE at compile time. For PAGE_SIZE 4K it uses page splitting, meaning for normal MTU frame size is 2048 bytes (and headroom 192 bytes). For larger MTUs the driver still use page splitting, by allocating order-1 pages (8192 bytes) for RX frames. For PAGE_SIZE larger than 4K, driver instead advance its rx_buffer->page_offset with the frame size "truesize". For XDP frame size calculations, this mean that in PAGE_SIZE larger than 4K mode the frame_sz change on a per packet basis. For the page split 4K PAGE_SIZE mode, xdp.frame_sz is more constant and can be updated once outside the main NAPI loop. The default setting in the driver uses build_skb(), which provides the necessary headroom and tailroom for XDP-redirect in RX-frame (in both modes). There is one complication, which is legacy-rx mode (configurable via ethtool priv-flags). There are zero headroom in this mode, which is a requirement for XDP-redirect to work. The conversion to xdp_frame (convert_to_xdp_frame) will detect this insufficient space, and xdp_do_redirect() call will fail. This is deemed acceptable, as it allows other XDP actions to still work in legacy-mode. In legacy-mode + larger PAGE_SIZE due to lacking tailroom, we also accept that xdp_adjust_tail shrink doesn't work. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Cc: intel-wired-lan@lists.osuosl.org Cc: Jeff Kirsher Cc: Alexander Duyck Link: https://lore.kernel.org/bpf/158945347002.97035.328088795813704587.stgit@firesoul --- drivers/net/ethernet/intel/ice/ice_txrx.c | 34 +++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index f67e8362958c..69b21b436f9a 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -423,6 +423,22 @@ static unsigned int ice_rx_offset(struct ice_ring *rx_ring) return 0; } +static unsigned int ice_rx_frame_truesize(struct ice_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ice_rx_offset(rx_ring) ? + SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + /** * ice_run_xdp - Executes an XDP program on initialized xdp_buff * @rx_ring: Rx ring @@ -991,6 +1007,10 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) bool failure; xdp.rxq = &rx_ring->xdp_rxq; + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, 0); +#endif /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { @@ -1038,6 +1058,10 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring); xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); +#endif rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); @@ -1051,16 +1075,8 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) if (!xdp_res) goto construct_skb; if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { - unsigned int truesize; - -#if (PAGE_SIZE < 8192) - truesize = ice_rx_pg_size(rx_ring) / 2; -#else - truesize = SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + - size); -#endif xdp_xmit |= xdp_res; - ice_rx_buf_adjust_pg_offset(rx_buf, truesize); + ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz); } else { rx_buf->pagecnt_bias++; } -- cgit v1.2.3 From 2a637c5b1aaf3b21418fadffad7e56ff27cee6f7 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:15 +0200 Subject: xdp: For Intel AF_XDP drivers add XDP frame_sz MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel drivers implement native AF_XDP zerocopy in separate C-files, that have its own invocation of bpf_prog_run_xdp(). The setup of xdp_buff is also handled in separately from normal code path. This patch update XDP frame_sz for AF_XDP zerocopy drivers i40e, ice and ixgbe, as the code changes needed are very similar. Introduce a helper function xsk_umem_xdp_frame_sz() for calculating frame size. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Björn Töpel Cc: intel-wired-lan@lists.osuosl.org Cc: Magnus Karlsson Link: https://lore.kernel.org/bpf/158945347511.97035.8536753731329475655.stgit@firesoul --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 ++ drivers/net/ethernet/intel/ice/ice_xsk.c | 2 ++ drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2 ++ include/net/xdp_sock.h | 11 +++++++++++ 4 files changed, 17 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 0b7d29192b2c..2b9184aead5f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -531,12 +531,14 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); + struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; struct xdp_buff xdp; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *bi; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 8279db15e870..23e5515d4527 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -840,11 +840,13 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); + struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_xmit = 0; bool failure = false; struct xdp_buff xdp; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { union ice_32b_rx_flex_desc *rx_desc; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 74b540ebb3dc..a656ee9a1fae 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -431,12 +431,14 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, unsigned int total_rx_bytes = 0, total_rx_packets = 0; struct ixgbe_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbe_desc_unused(rx_ring); + struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; struct xdp_buff xdp; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 67191ccaab85..abd72de25fa4 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -236,6 +236,12 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, else return address + offset; } + +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return umem->chunk_size_nohr + umem->headroom; +} + #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -366,6 +372,11 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, return 0; } +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return 0; +} + static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; -- cgit v1.2.3 From d628ee4fef1dbd6f2fa11e3548322c7839319537 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:20 +0200 Subject: mlx5: Rx queue setup time determine frame_sz for XDP The mlx5 driver have multiple memory models, which are also changed according to whether a XDP bpf_prog is attached. The 'rx_striding_rq' setting is adjusted via ethtool priv-flags e.g.: # ethtool --set-priv-flags mlx5p2 rx_striding_rq off On the general case with 4K page_size and regular MTU packet, then the frame_sz is 2048 and 4096 when XDP is enabled, in both modes. The info on the given frame size is stored differently depending on the RQ-mode and encoded in a union in struct mlx5e_rq union wqe/mpwqe. In rx striding mode rq->mpwqe.log_stride_sz is either 11 or 12, which corresponds to 2048 or 4096 (MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ). In non-striding mode (MLX5_WQ_TYPE_CYCLIC) the frag_stride is stored in rq->wqe.info.arr[0].frag_stride, for the first fragment, which is what the XDP case cares about. To reduce effect on fast-path, this patch determine the frame_sz at setup time, to avoid determining the memory model runtime. Variable is named frame0_sz to make it clear that this is only the frame size of the first fragment. This mlx5 driver does a DMA-sync on XDP_TX action, but grow is safe as it have done a DMA-map on the entire PAGE_SIZE. The driver also already does a XDP length check against sq->hw_mtu on the possible XDP xmit paths mlx5e_xmit_xdp_frame() + mlx5e_xmit_xdp_frame_mpwqe(). V3+4: Change variable name first_frame_sz to frame0_sz V2: Fix that frag_size need to be recalc before creating SKB. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Tariq Toukan Cc: Saeed Mahameed Link: https://lore.kernel.org/bpf/158945348021.97035.12295039384250022883.stgit@firesoul --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 ++++++ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 ++ 4 files changed, 10 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 3bd64c63865b..26911b15f8fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -625,6 +625,7 @@ struct mlx5e_rq { struct { u16 umem_headroom; u16 headroom; + u32 frame0_sz; u8 map_dir; /* dma map direction */ } buff; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index c4a7fb4ecd14..761c8979bd41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -137,6 +137,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, if (xsk) xdp.handle = di->xsk.handle; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = rq->buff.frame0_sz; act = bpf_prog_run_xdp(prog, &xdp); if (xsk) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 0a9dfc31de3e..0e4ca08ddca9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -462,6 +462,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk)); + rq->buff.frame0_sz = (1 << rq->mpwqe.log_stride_sz); + err = mlx5e_create_rq_umr_mkey(mdev, rq); if (err) goto err_rq_wq_destroy; @@ -485,6 +487,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags; rq->wqe.info = rqp->frags_info; + rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride; + rq->wqe.frags = kvzalloc_node(array_size(sizeof(*rq->wqe.frags), (wq_sz << rq->wqe.info.log_num_frags)), @@ -522,6 +526,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, } if (xsk) { + rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem); + err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames); if (unlikely(err)) { mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 779600bebcca..821f94beda7a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1070,6 +1070,7 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, if (consumed) return NULL; /* page/packet was consumed by XDP */ + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt); if (unlikely(!skb)) return NULL; @@ -1371,6 +1372,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, return NULL; /* page/packet was consumed by XDP */ } + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32); if (unlikely(!skb)) return NULL; -- cgit v1.2.3 From c8741e2bfe872425ea6f10bb6f7dc1d67bc60c3a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:25 +0200 Subject: xdp: Allow bpf_xdp_adjust_tail() to grow packet size Finally, after all drivers have a frame size, allow BPF-helper bpf_xdp_adjust_tail() to grow or extend packet size at frame tail. Remember that helper/macro xdp_data_hard_end have reserved some tailroom. Thus, this helper makes sure that the BPF-prog don't have access to this tailroom area. V2: Remove one chicken check and use WARN_ONCE for other Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945348530.97035.12577148209134239291.stgit@firesoul --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 32cbf36c7729..b9b8a0f63b91 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2015,8 +2015,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers diff --git a/net/core/filter.c b/net/core/filter.c index 5815902bb617..e7b033dad44e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3411,12 +3411,19 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; - /* only shrinking is allowed for now. */ - if (unlikely(offset >= 0)) + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) return -EINVAL; + /* ALL drivers MUST init xdp->frame_sz, chicken check below */ + if (unlikely(xdp->frame_sz > PAGE_SIZE)) { + WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); + return -EINVAL; + } + if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; -- cgit v1.2.3 From ddb47d518ca10948d1f64a983cb9274720f691cd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:30 +0200 Subject: xdp: Clear grow memory in bpf_xdp_adjust_tail() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clearing memory of tail when grow happens, because it is too easy to write a XDP_PASS program that extend the tail, which expose this memory to users that can run tcpdump. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945349039.97035.5262100484553494.stgit@firesoul --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index e7b033dad44e..a85eb538d4d6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3427,6 +3427,10 @@ BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + xdp->data_end = data_end; return 0; -- cgit v1.2.3 From bc56c919fce782f616823b76fb70a788f4762cf5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:35 +0200 Subject: bpf: Add xdp.frame_sz in bpf_prog_test_run_xdp(). Update the memory requirements, when adding xdp.frame_sz in BPF test_run function bpf_prog_test_run_xdp() which e.g. is used by XDP selftests. Specifically add the expected reserved tailroom, but also allocated a larger memory area to reflect that XDP frames usually comes in this format. Limit the provided packet data size to 4096 minus headroom + tailroom, as this also reflect a common 3520 bytes MTU limit with XDP. Note that bpf_test_init already use a memory allocation method that clears memory. Thus, this already guards against leaking uninit kernel memory. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945349549.97035.15316291762482444006.stgit@firesoul --- net/bpf/test_run.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 29dbdd4c29f6..30ba7d38941d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -470,25 +470,34 @@ out: int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 headroom = XDP_PACKET_HEADROOM; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + u32 max_data_sz; void *data; int ret; if (kattr->test.ctx_in || kattr->test.ctx_out) return -EINVAL; - data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); + /* XDP have extra tailroom as (most) drivers use full page */ + max_data_sz = 4096 - headroom - tailroom; + if (size > max_data_sz) + return -EINVAL; + + data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) return PTR_ERR(data); xdp.data_hard_start = data; - xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data = data + headroom; xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; + xdp.frame_sz = headroom + max_data_sz + tailroom; rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; @@ -496,8 +505,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; - if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || - xdp.data_end != xdp.data + size) + if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: -- cgit v1.2.3 From 68545fb6f2ff621de26d96a3f15868abfb6897b0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:40 +0200 Subject: selftests/bpf: Adjust BPF selftest for xdp_adjust_tail Current selftest for BPF-helper xdp_adjust_tail only shrink tail. Make it more clear that this is a shrink test case. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945350058.97035.17280775016196207372.stgit@firesoul --- .../selftests/bpf/prog_tests/xdp_adjust_tail.c | 9 +++++-- .../testing/selftests/bpf/progs/test_adjust_tail.c | 30 ---------------------- .../bpf/progs/test_xdp_adjust_tail_shrink.c | 30 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 32 deletions(-) delete mode 100644 tools/testing/selftests/bpf/progs/test_adjust_tail.c create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index 6c8ca1c93f9b..a76dd81dfce9 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -2,9 +2,9 @@ #include #include -void test_xdp_adjust_tail(void) +void test_xdp_adjust_tail_shrink(void) { - const char *file = "./test_adjust_tail.o"; + const char *file = "./test_xdp_adjust_tail_shrink.o"; struct bpf_object *obj; char buf[128]; __u32 duration, retval, size; @@ -28,3 +28,8 @@ void test_xdp_adjust_tail(void) err, errno, retval, size); bpf_object__close(obj); } + +void test_xdp_adjust_tail(void) +{ + test_xdp_adjust_tail_shrink(); +} diff --git a/tools/testing/selftests/bpf/progs/test_adjust_tail.c b/tools/testing/selftests/bpf/progs/test_adjust_tail.c deleted file mode 100644 index b7fc85769bdc..000000000000 --- a/tools/testing/selftests/bpf/progs/test_adjust_tail.c +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright (c) 2018 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include -#include -#include - -int _version SEC("version") = 1; - -SEC("xdp_adjust_tail") -int _xdp_adjust_tail(struct xdp_md *xdp) -{ - void *data_end = (void *)(long)xdp->data_end; - void *data = (void *)(long)xdp->data; - int offset = 0; - - if (data_end - data == 54) - offset = 256; - else - offset = 20; - if (bpf_xdp_adjust_tail(xdp, 0 - offset)) - return XDP_DROP; - return XDP_TX; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c new file mode 100644 index 000000000000..22065a9cfb25 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include + +int _version SEC("version") = 1; + +SEC("xdp_adjust_tail_shrink") +int _xdp_adjust_tail_shrink(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + int offset = 0; + + if (data_end - data == 54) /* sizeof(pkt_v4) */ + offset = 256; /* shrink too much */ + else + offset = 20; + if (bpf_xdp_adjust_tail(xdp, 0 - offset)) + return XDP_DROP; + return XDP_TX; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 7ae2e00e8fc23f10169079fadd388317d81012be Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:45 +0200 Subject: selftests/bpf: Xdp_adjust_tail add grow tail tests Extend BPF selftest xdp_adjust_tail with grow tail tests, which is added as subtest's. The first grow test stays in same form as original shrink test. The second grow test use the newer bpf_prog_test_run_xattr() calls, and does extra checking of data contents. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945350567.97035.9632611946765811876.stgit@firesoul --- .../selftests/bpf/prog_tests/xdp_adjust_tail.c | 116 ++++++++++++++++++++- .../bpf/progs/test_xdp_adjust_tail_grow.c | 33 ++++++ 2 files changed, 144 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index a76dd81dfce9..d5c98f2cb12f 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -5,10 +5,10 @@ void test_xdp_adjust_tail_shrink(void) { const char *file = "./test_xdp_adjust_tail_shrink.o"; + __u32 duration, retval, size, expect_sz; struct bpf_object *obj; - char buf[128]; - __u32 duration, retval, size; int err, prog_fd; + char buf[128]; err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); if (CHECK_FAIL(err)) @@ -21,15 +21,121 @@ void test_xdp_adjust_tail_shrink(void) "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + expect_sz = sizeof(pkt_v6) - 20; /* Test shrink with 20 bytes */ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || retval != XDP_TX || size != 54, - "ipv6", "err %d errno %d retval %d size %d\n", + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + struct bpf_object *obj; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + __u32 duration, retval, size, expect_sz; + int err, prog_fd; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_DROP, + "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + + expect_sz = sizeof(pkt_v6) + 40; /* Test grow with 40 bytes */ + err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6) /* 74 */, + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow2(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/; + struct bpf_object *obj; + int err, cnt, i; + int max_grow; + + struct bpf_prog_test_run_attr tattr = { + .repeat = 1, + .data_in = &buf, + .data_out = &buf, + .data_size_in = 0, /* Per test */ + .data_size_out = 0, /* Per test */ + }; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd); + if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + return; + + /* Test case-64 */ + memset(buf, 1, sizeof(buf)); + tattr.data_size_in = 64; /* Determine test case via pkt size */ + tattr.data_size_out = 128; /* Limit copy_size */ + /* Kernel side alloc packet memory area that is zero init */ + err = bpf_prog_test_run_xattr(&tattr); + + CHECK_ATTR(errno != ENOSPC /* Due limit copy_size in bpf_test_finish */ + || tattr.retval != XDP_TX + || tattr.data_size_out != 192, /* Expected grow size */ + "case-64", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Extra checks for data contents */ + CHECK_ATTR(tattr.data_size_out != 192 + || buf[0] != 1 || buf[63] != 1 /* 0-63 memset to 1 */ + || buf[64] != 0 || buf[127] != 0 /* 64-127 memset to 0 */ + || buf[128] != 1 || buf[191] != 1, /*128-191 memset to 1 */ + "case-64-data", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Test case-128 */ + memset(buf, 2, sizeof(buf)); + tattr.data_size_in = 128; /* Determine test case via pkt size */ + tattr.data_size_out = sizeof(buf); /* Copy everything */ + err = bpf_prog_test_run_xattr(&tattr); + + max_grow = 4096 - XDP_PACKET_HEADROOM - tailroom; /* 3520 */ + CHECK_ATTR(err + || tattr.retval != XDP_TX + || tattr.data_size_out != max_grow,/* Expect max grow size */ + "case-128", + "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, max_grow); + + /* Extra checks for data content: Count grow size, will contain zeros */ + for (i = 0, cnt = 0; i < sizeof(buf); i++) { + if (buf[i] == 0) + cnt++; + } + CHECK_ATTR((cnt != (max_grow - tattr.data_size_in)) /* Grow increase */ + || tattr.data_size_out != max_grow, /* Total grow size */ + "case-128-data", + "err %d errno %d retval %d size %d grow-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, cnt); + bpf_object__close(obj); } void test_xdp_adjust_tail(void) { - test_xdp_adjust_tail_shrink(); + if (test__start_subtest("xdp_adjust_tail_shrink")) + test_xdp_adjust_tail_shrink(); + if (test__start_subtest("xdp_adjust_tail_grow")) + test_xdp_adjust_tail_grow(); + if (test__start_subtest("xdp_adjust_tail_grow2")) + test_xdp_adjust_tail_grow2(); } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c new file mode 100644 index 000000000000..3d66599eee2e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +SEC("xdp_adjust_tail_grow") +int _xdp_adjust_tail_grow(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + unsigned int data_len; + int offset = 0; + + /* Data length determine test case */ + data_len = data_end - data; + + if (data_len == 54) { /* sizeof(pkt_v4) */ + offset = 4096; /* test too large offset */ + } else if (data_len == 74) { /* sizeof(pkt_v6) */ + offset = 40; + } else if (data_len == 64) { + offset = 128; + } else if (data_len == 128) { + offset = 4096 - 256 - 320 - data_len; /* Max tail grow 3520 */ + } else { + return XDP_ABORTED; /* No matching test */ + } + + if (bpf_xdp_adjust_tail(xdp, offset)) + return XDP_DROP; + return XDP_TX; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 0ee52c0f6c67e187ff1906f6048af7c96df320c7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 13 May 2020 09:58:49 +0200 Subject: bpf, bpftool: Allow probing for CONFIG_HZ from kernel config In Cilium we've recently switched to make use of bpf_jiffies64() for parts of our tc and XDP datapath since bpf_ktime_get_ns() is more expensive and high-precision is not needed for our timeouts we have anyway. Our agent has a probe manager which picks up the json of bpftool's feature probe and we also use the macro output in our C programs e.g. to have workarounds when helpers are not available on older kernels. Extend the kernel config info dump to also include the kernel's CONFIG_HZ, and rework the probe_kernel_image_config() for allowing a macro dump such that CONFIG_HZ can be propagated to BPF C code as a simple define if available via config. Latter allows to have _compile- time_ resolution of jiffies <-> sec conversion in our code since all are propagated as known constants. Given we cannot generally assume availability of kconfig everywhere, we also have a kernel hz probe [0] as a fallback. Potentially, bpftool could have an integrated probe fallback as well, although to derive it, we might need to place it under 'bpftool feature probe full' or similar given it would slow down the probing process overall. Yet 'full' doesn't fit either for us since we don't want to pollute the kernel log with warning messages from bpf_probe_write_user() and bpf_trace_printk() on agent startup; I've left it out for the time being. [0] https://github.com/cilium/cilium/blob/master/bpf/cilium-probe-kernel-hz.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200513075849.20868-1-daniel@iogearbox.net --- tools/bpf/bpftool/feature.c | 120 +++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 53 deletions(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index f54347f55ee0..1b73e63274b5 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -80,13 +80,12 @@ print_bool_feature(const char *feat_name, const char *plain_name, printf("%s is %savailable\n", plain_name, res ? "" : "NOT "); } -static void print_kernel_option(const char *name, const char *value) +static void print_kernel_option(const char *name, const char *value, + const char *define_prefix) { char *endptr; int res; - /* No support for C-style ouptut */ - if (json_output) { if (!value) { jsonw_null_field(json_wtr, name); @@ -98,6 +97,12 @@ static void print_kernel_option(const char *name, const char *value) jsonw_int_field(json_wtr, name, res); else jsonw_string_field(json_wtr, name, value); + } else if (define_prefix) { + if (value) + printf("#define %s%s %s\n", define_prefix, + name, value); + else + printf("/* %s%s is not set */\n", define_prefix, name); } else { if (value) printf("%s is set to %s\n", name, value); @@ -315,77 +320,84 @@ static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, return false; } -static void probe_kernel_image_config(void) +static void probe_kernel_image_config(const char *define_prefix) { - static const char * const options[] = { + static const struct { + const char * const name; + bool macro_dump; + } options[] = { /* Enable BPF */ - "CONFIG_BPF", + { "CONFIG_BPF", }, /* Enable bpf() syscall */ - "CONFIG_BPF_SYSCALL", + { "CONFIG_BPF_SYSCALL", }, /* Does selected architecture support eBPF JIT compiler */ - "CONFIG_HAVE_EBPF_JIT", + { "CONFIG_HAVE_EBPF_JIT", }, /* Compile eBPF JIT compiler */ - "CONFIG_BPF_JIT", + { "CONFIG_BPF_JIT", }, /* Avoid compiling eBPF interpreter (use JIT only) */ - "CONFIG_BPF_JIT_ALWAYS_ON", + { "CONFIG_BPF_JIT_ALWAYS_ON", }, /* cgroups */ - "CONFIG_CGROUPS", + { "CONFIG_CGROUPS", }, /* BPF programs attached to cgroups */ - "CONFIG_CGROUP_BPF", + { "CONFIG_CGROUP_BPF", }, /* bpf_get_cgroup_classid() helper */ - "CONFIG_CGROUP_NET_CLASSID", + { "CONFIG_CGROUP_NET_CLASSID", }, /* bpf_skb_{,ancestor_}cgroup_id() helpers */ - "CONFIG_SOCK_CGROUP_DATA", + { "CONFIG_SOCK_CGROUP_DATA", }, /* Tracing: attach BPF to kprobes, tracepoints, etc. */ - "CONFIG_BPF_EVENTS", + { "CONFIG_BPF_EVENTS", }, /* Kprobes */ - "CONFIG_KPROBE_EVENTS", + { "CONFIG_KPROBE_EVENTS", }, /* Uprobes */ - "CONFIG_UPROBE_EVENTS", + { "CONFIG_UPROBE_EVENTS", }, /* Tracepoints */ - "CONFIG_TRACING", + { "CONFIG_TRACING", }, /* Syscall tracepoints */ - "CONFIG_FTRACE_SYSCALLS", + { "CONFIG_FTRACE_SYSCALLS", }, /* bpf_override_return() helper support for selected arch */ - "CONFIG_FUNCTION_ERROR_INJECTION", + { "CONFIG_FUNCTION_ERROR_INJECTION", }, /* bpf_override_return() helper */ - "CONFIG_BPF_KPROBE_OVERRIDE", + { "CONFIG_BPF_KPROBE_OVERRIDE", }, /* Network */ - "CONFIG_NET", + { "CONFIG_NET", }, /* AF_XDP sockets */ - "CONFIG_XDP_SOCKETS", + { "CONFIG_XDP_SOCKETS", }, /* BPF_PROG_TYPE_LWT_* and related helpers */ - "CONFIG_LWTUNNEL_BPF", + { "CONFIG_LWTUNNEL_BPF", }, /* BPF_PROG_TYPE_SCHED_ACT, TC (traffic control) actions */ - "CONFIG_NET_ACT_BPF", + { "CONFIG_NET_ACT_BPF", }, /* BPF_PROG_TYPE_SCHED_CLS, TC filters */ - "CONFIG_NET_CLS_BPF", + { "CONFIG_NET_CLS_BPF", }, /* TC clsact qdisc */ - "CONFIG_NET_CLS_ACT", + { "CONFIG_NET_CLS_ACT", }, /* Ingress filtering with TC */ - "CONFIG_NET_SCH_INGRESS", + { "CONFIG_NET_SCH_INGRESS", }, /* bpf_skb_get_xfrm_state() helper */ - "CONFIG_XFRM", + { "CONFIG_XFRM", }, /* bpf_get_route_realm() helper */ - "CONFIG_IP_ROUTE_CLASSID", + { "CONFIG_IP_ROUTE_CLASSID", }, /* BPF_PROG_TYPE_LWT_SEG6_LOCAL and related helpers */ - "CONFIG_IPV6_SEG6_BPF", + { "CONFIG_IPV6_SEG6_BPF", }, /* BPF_PROG_TYPE_LIRC_MODE2 and related helpers */ - "CONFIG_BPF_LIRC_MODE2", + { "CONFIG_BPF_LIRC_MODE2", }, /* BPF stream parser and BPF socket maps */ - "CONFIG_BPF_STREAM_PARSER", + { "CONFIG_BPF_STREAM_PARSER", }, /* xt_bpf module for passing BPF programs to netfilter */ - "CONFIG_NETFILTER_XT_MATCH_BPF", + { "CONFIG_NETFILTER_XT_MATCH_BPF", }, /* bpfilter back-end for iptables */ - "CONFIG_BPFILTER", + { "CONFIG_BPFILTER", }, /* bpftilter module with "user mode helper" */ - "CONFIG_BPFILTER_UMH", + { "CONFIG_BPFILTER_UMH", }, /* test_bpf module for BPF tests */ - "CONFIG_TEST_BPF", + { "CONFIG_TEST_BPF", }, + + /* Misc configs useful in BPF C programs */ + /* jiffies <-> sec conversion for bpf_jiffies64() helper */ + { "CONFIG_HZ", true, } }; char *values[ARRAY_SIZE(options)] = { }; struct utsname utsn; @@ -427,7 +439,8 @@ static void probe_kernel_image_config(void) while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { for (i = 0; i < ARRAY_SIZE(options); i++) { - if (values[i] || strcmp(buf, options[i])) + if ((define_prefix && !options[i].macro_dump) || + values[i] || strcmp(buf, options[i].name)) continue; values[i] = strdup(value); @@ -439,7 +452,9 @@ end_parse: gzclose(file); for (i = 0; i < ARRAY_SIZE(options); i++) { - print_kernel_option(options[i], values[i]); + if (define_prefix && !options[i].macro_dump) + continue; + print_kernel_option(options[i].name, values[i], define_prefix); free(values[i]); } } @@ -632,23 +647,22 @@ section_system_config(enum probe_component target, const char *define_prefix) switch (target) { case COMPONENT_KERNEL: case COMPONENT_UNSPEC: - if (define_prefix) - break; - print_start_section("system_config", "Scanning system configuration...", - NULL, /* define_comment never used here */ - NULL); /* define_prefix always NULL here */ - if (check_procfs()) { - probe_unprivileged_disabled(); - probe_jit_enable(); - probe_jit_harden(); - probe_jit_kallsyms(); - probe_jit_limit(); - } else { - p_info("/* procfs not mounted, skipping related probes */"); + "/*** Misc kernel config items ***/", + define_prefix); + if (!define_prefix) { + if (check_procfs()) { + probe_unprivileged_disabled(); + probe_jit_enable(); + probe_jit_harden(); + probe_jit_kallsyms(); + probe_jit_limit(); + } else { + p_info("/* procfs not mounted, skipping related probes */"); + } } - probe_kernel_image_config(); + probe_kernel_image_config(define_prefix); print_end_section(); break; default: -- cgit v1.2.3 From a17b53c4a4b55ec322c132b6670743612229ee9c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:53 -0700 Subject: bpf, capability: Introduce CAP_BPF Split BPF operations that are allowed under CAP_SYS_ADMIN into combination of CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN. For backward compatibility include them in CAP_SYS_ADMIN as well. The end result provides simple safety model for applications that use BPF: - to load tracing program types BPF_PROG_TYPE_{KPROBE, TRACEPOINT, PERF_EVENT, RAW_TRACEPOINT, etc} use CAP_BPF and CAP_PERFMON - to load networking program types BPF_PROG_TYPE_{SCHED_CLS, XDP, SK_SKB, etc} use CAP_BPF and CAP_NET_ADMIN There are few exceptions from this rule: - bpf_trace_printk() is allowed in networking programs, but it's using tracing mechanism, hence this helper needs additional CAP_PERFMON if networking program is using this helper. - BPF_F_ZERO_SEED flag for hash/lru map is allowed under CAP_SYS_ADMIN only to discourage production use. - BPF HW offload is allowed under CAP_SYS_ADMIN. - bpf_probe_write_user() is allowed under CAP_SYS_ADMIN only. CAPs are not checked at attach/detach time with two exceptions: - loading BPF_PROG_TYPE_CGROUP_SKB is allowed for unprivileged users, hence CAP_NET_ADMIN is required at attach time. - flow_dissector detach doesn't check prog FD at detach, hence CAP_NET_ADMIN is required at detach time. CAP_SYS_ADMIN is required to iterate BPF objects (progs, maps, links) via get_next_id command and convert them to file descriptor via GET_FD_BY_ID command. This restriction guarantees that mutliple tasks with CAP_BPF are not able to affect each other. That leads to clean isolation of tasks. For example: task A with CAP_BPF and CAP_NET_ADMIN loads and attaches a firewall via bpf_link. task B with the same capabilities cannot detach that firewall unless task A explicitly passed link FD to task B via scm_rights or bpffs. CAP_SYS_ADMIN can still detach/unload everything. Two networking user apps with CAP_SYS_ADMIN and CAP_NET_ADMIN can accidentely mess with each other programs and maps. Two networking user apps with CAP_NET_ADMIN and CAP_BPF cannot affect each other. CAP_NET_ADMIN + CAP_BPF allows networking programs access only packet data. Such networking progs cannot access arbitrary kernel memory or leak pointers. bpftool, bpftrace, bcc tools binaries should NOT be installed with CAP_BPF and CAP_PERFMON, since unpriv users will be able to read kernel secrets. But users with these two permissions will be able to use these tracing tools. CAP_PERFMON is least secure, since it allows kprobes and kernel memory access. CAP_NET_ADMIN can stop network traffic via iproute2. CAP_BPF is the safest from security point of view and harmless on its own. Having CAP_BPF and/or CAP_NET_ADMIN is not enough to write into arbitrary map and if that map is used by firewall-like bpf prog. CAP_BPF allows many bpf prog_load commands in parallel. The verifier may consume large amount of memory and significantly slow down the system. Existing unprivileged BPF operations are not affected. In particular unprivileged users are allowed to load socket_filter and cg_skb program types and to create array, hash, prog_array, map-in-map map types. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-2-alexei.starovoitov@gmail.com --- include/linux/capability.h | 5 +++++ include/uapi/linux/capability.h | 34 +++++++++++++++++++++++++++++++++- security/selinux/include/classmap.h | 4 ++-- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/include/linux/capability.h b/include/linux/capability.h index 027d7e4a853b..b4345b38a6be 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -256,6 +256,11 @@ static inline bool perfmon_capable(void) return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } +static inline bool bpf_capable(void) +{ + return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); +} + /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index e58c9636741b..c7372180a0a9 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -274,6 +274,7 @@ struct vfs_ns_cap_data { arbitrary SCSI commands */ /* Allow setting encryption key on loopback filesystem */ /* Allow setting zone reclaim policy */ +/* Allow everything under CAP_BPF and CAP_PERFMON for backward compatibility */ #define CAP_SYS_ADMIN 21 @@ -374,7 +375,38 @@ struct vfs_ns_cap_data { #define CAP_PERFMON 38 -#define CAP_LAST_CAP CAP_PERFMON +/* + * CAP_BPF allows the following BPF operations: + * - Creating all types of BPF maps + * - Advanced verifier features + * - Indirect variable access + * - Bounded loops + * - BPF to BPF function calls + * - Scalar precision tracking + * - Larger complexity limits + * - Dead code elimination + * - And potentially other features + * - Loading BPF Type Format (BTF) data + * - Retrieve xlated and JITed code of BPF programs + * - Use bpf_spin_lock() helper + * + * CAP_PERFMON relaxes the verifier checks further: + * - BPF progs can use of pointer-to-integer conversions + * - speculation attack hardening measures are bypassed + * - bpf_probe_read to read arbitrary kernel memory is allowed + * - bpf_trace_printk to print kernel memory is allowed + * + * CAP_SYS_ADMIN is required to use bpf_probe_write_user. + * + * CAP_SYS_ADMIN is required to iterate system wide loaded + * programs, maps, links, BTFs and convert their IDs to file descriptors. + * + * CAP_PERFMON and CAP_BPF are required to load tracing programs. + * CAP_NET_ADMIN and CAP_BPF are required to load networking programs. + */ +#define CAP_BPF 39 + +#define CAP_LAST_CAP CAP_BPF #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index d233ab3f1533..98e1513b608a 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read", "perfmon" + "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf" -#if CAP_LAST_CAP > CAP_PERFMON +#if CAP_LAST_CAP > CAP_BPF #error New capability defined, please update COMMON_CAP2_PERMS. #endif -- cgit v1.2.3 From 2c78ee898d8f10ae6fb2fa23a3fbaec96b1b7366 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:54 -0700 Subject: bpf: Implement CAP_BPF Implement permissions as stated in uapi/linux/capability.h In order to do that the verifier allow_ptr_leaks flag is split into four flags and they are set as: env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); The first three currently equivalent to perfmon_capable(), since leaking kernel pointers and reading kernel memory via side channel attacks is roughly equivalent to reading kernel memory with cap_perfmon. 'bpf_capable' enables bounded loops, precision tracking, bpf to bpf calls and other verifier features. 'allow_ptr_leaks' enable ptr leaks, ptr conversions, subtraction of pointers. 'bypass_spec_v1' disables speculative analysis in the verifier, run time mitigations in bpf array, and enables indirect variable access in bpf programs. 'bypass_spec_v4' disables emission of sanitation code by the verifier. That means that the networking BPF program loaded with CAP_BPF + CAP_NET_ADMIN will have speculative checks done by the verifier and other spectre mitigation applied. Such networking BPF program will not be able to leak kernel pointers and will not be able to access arbitrary kernel memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-3-alexei.starovoitov@gmail.com --- drivers/media/rc/bpf-lirc.c | 2 +- include/linux/bpf.h | 18 ++++++++- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/arraymap.c | 10 ++--- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/hashtab.c | 4 +- kernel/bpf/helpers.c | 4 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/syscall.c | 89 +++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 37 +++++++++--------- kernel/trace/bpf_trace.c | 3 ++ net/core/bpf_sk_storage.c | 4 +- net/core/filter.c | 4 +- 19 files changed, 134 insertions(+), 60 deletions(-) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 069c42f22a8c..5bb144435c16 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) + if (perfmon_capable()) return bpf_get_trace_printk_proto(); /* fall through */ default: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c45d198ac38c..efe8836b5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -19,6 +19,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -119,7 +120,7 @@ struct bpf_map { struct bpf_map_memory memory; char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; - bool unpriv_array; + bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ /* 22 bytes hole */ @@ -1095,6 +1096,21 @@ struct bpf_map *bpf_map_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; +static inline bool bpf_allow_ptr_leaks(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v1(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v4(void) +{ + return perfmon_capable(); +} + int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6abd5a778fcd..ea833087e853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -375,6 +375,9 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool bpf_capable; + bool bypass_spec_v1; + bool bypass_spec_v4; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d5bb0d983b2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa11de67315..c40ff4cf9880 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a71790dab12d..8b85bfddfac7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. */ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5c0290e0696e..886949fdcece 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 30e1373fd437..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de2a75500233..79bcd8d056d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; @@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return skb_flow_dissector_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3f2af756fd6..180933f6fba9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - !register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8353,7 +8353,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d961428fb5b6..9a84d7fb4869 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 756b63b6f7b3..d2c4d16dadba 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; if (attr->value_size > MAX_VALUE_SIZE) @@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. */ - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { diff --git a/net/core/filter.c b/net/core/filter.c index a85eb538d4d6..f8a3c7e9d027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6687,7 +6687,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; } @@ -6699,7 +6699,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; default: -- cgit v1.2.3 From 81626001187609b9c49696a5b48d5abcf0e5f9be Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:55 -0700 Subject: selftests/bpf: Use CAP_BPF and CAP_PERFMON in tests Make all test_verifier test exercise CAP_BPF and CAP_PERFMON Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-4-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/test_verifier.c | 44 +++++++++++++++++++----- tools/testing/selftests/bpf/verifier/calls.c | 16 ++++----- tools/testing/selftests/bpf/verifier/dead_code.c | 10 +++--- 3 files changed, 49 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 21a1ce219c1c..78a6bae56ea6 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -818,10 +818,18 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, } } +struct libcap { + struct __user_cap_header_struct hdr; + struct __user_cap_data_struct data[2]; +}; + static int set_admin(bool admin) { cap_t caps; - const cap_value_t cap_val = CAP_SYS_ADMIN; + /* need CAP_BPF, CAP_NET_ADMIN, CAP_PERFMON to load progs */ + const cap_value_t cap_net_admin = CAP_NET_ADMIN; + const cap_value_t cap_sys_admin = CAP_SYS_ADMIN; + struct libcap *cap; int ret = -1; caps = cap_get_proc(); @@ -829,11 +837,26 @@ static int set_admin(bool admin) perror("cap_get_proc"); return -1; } - if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val, + cap = (struct libcap *)caps; + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) { + perror("cap_set_flag clear admin"); + goto out; + } + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin, admin ? CAP_SET : CAP_CLEAR)) { - perror("cap_set_flag"); + perror("cap_set_flag set_or_clear net"); goto out; } + /* libcap is likely old and simply ignores CAP_BPF and CAP_PERFMON, + * so update effective bits manually + */ + if (admin) { + cap->data[1].effective |= 1 << (38 /* CAP_PERFMON */ - 32); + cap->data[1].effective |= 1 << (39 /* CAP_BPF */ - 32); + } else { + cap->data[1].effective &= ~(1 << (38 - 32)); + cap->data[1].effective &= ~(1 << (39 - 32)); + } if (cap_set_proc(caps)) { perror("cap_set_proc"); goto out; @@ -1067,9 +1090,11 @@ fail_log: static bool is_admin(void) { + cap_flag_value_t net_priv = CAP_CLEAR; + bool perfmon_priv = false; + bool bpf_priv = false; + struct libcap *cap; cap_t caps; - cap_flag_value_t sysadmin = CAP_CLEAR; - const cap_value_t cap_val = CAP_SYS_ADMIN; #ifdef CAP_IS_SUPPORTED if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) { @@ -1082,11 +1107,14 @@ static bool is_admin(void) perror("cap_get_proc"); return false; } - if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin)) - perror("cap_get_flag"); + cap = (struct libcap *)caps; + bpf_priv = cap->data[1].effective & (1 << (39/* CAP_BPF */ - 32)); + perfmon_priv = cap->data[1].effective & (1 << (38/* CAP_PERFMON */ - 32)); + if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv)) + perror("cap_get_flag NET"); if (cap_free(caps)) perror("cap_free"); - return (sysadmin == CAP_SET); + return bpf_priv && perfmon_priv && net_priv == CAP_SET; } static void get_unpriv_disabled() diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 2d752c4f8d9d..7629a0cebb9b 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -315,7 +315,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = POINTER_VALUE, @@ -346,7 +346,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ -1064,7 +1064,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .errstr = "R0 !read_ok", .result = REJECT, @@ -1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 50a8a63be4ac..5cf361d8eb1c 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, -- cgit v1.2.3