summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2021-04-11 16:39:28 -0700
committerDavid S. Miller <davem@davemloft.net>2021-04-11 16:39:28 -0700
commit23cfa4d4aa9fa634e43edd7f92cde4f050b4f2db (patch)
tree155c5d55bcb73a63e71dcd8ca96807de13ac61bd
parent7dc85b599ae17fb705ffae1b7321ace4b3056aeb (diff)
parent1c3cadbe02420e6c85251c416a78a16f17761231 (diff)
Merge branch 'veth-gro'
Paolo Abeni says: ==================== veth: allow GRO even without XDP This series allows user-space to enable GRO/NAPI on a veth device even without attaching an XDP program. It does not change the default veth behavior (no NAPI, no GRO), except that, on top of this series, the GRO feature bit will be effectively off by default on veth devices. Note that currently the GRO bit is on by default, but GRO never takes place in the absence of XDP. On top of this series, setting the GRO feature bit enables NAPI and allows GRO to take place. The TSO features on the peer device are preserved. The main goal is improving UDP forwarding performance for containers in a typical virtual network setup: (container) veth -> veth peer -> bridge/ovs -> vxlan -> NIC Enabling the NAPI threaded mode, GRO and the NETIF_F_GRO_UDP_FWD feature on the veth peer improves the UDP stream performance with a non-empty netfilter configuration by a factor of 2, with no measurable overhead for TCP traffic: a heuristic ensures that TCP will not go through the additional NAPI/GRO layer. Some self-tests are added to check the expected behavior in the default configuration, with XDP and with plain GRO enabled. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/veth.c152
-rw-r--r--tools/testing/selftests/net/Makefile1
-rwxr-xr-xtools/testing/selftests/net/veth.sh177
3 files changed, 316 insertions, 14 deletions
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 9e525646df1d..15b2e3923c47 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -57,6 +57,7 @@ struct veth_rq_stats {
struct veth_rq {
struct napi_struct xdp_napi;
+ struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
struct net_device *dev;
struct bpf_prog __rcu *xdp_prog;
struct xdp_mem_info xdp_mem;
@@ -293,13 +294,32 @@ static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
netif_rx(skb);
}
+/* Cheap heuristic: is it worth pushing this skb through NAPI/GRO?
+ *
+ * Returns true when @skb has a chance of being aggregated. Accuracy is
+ * traded for low overhead in the most common scenarios:
+ *  - if the xmit device @dev has no TSO feature set (as is the case when
+ *    XDP is enabled), all traffic is considered eligible;
+ *  - otherwise only UDP aggregation is likely of interest, so the skb
+ *    must look like locally generated UDP traffic - sock_wfree is the
+ *    destructor used by UDP, ICMP and XDP sockets - and the receiving
+ *    device @rcv must have fraglist GRO or UDP forwarding GRO enabled.
+ */
+static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
+					 const struct net_device *rcv,
+					 const struct sk_buff *skb)
+{
+	const netdev_features_t udp_gro = NETIF_F_GRO_FRAGLIST |
+					  NETIF_F_GRO_UDP_FWD;
+
+	/* no TSO on the sender: every skb is a GRO candidate */
+	if (!(dev->features & NETIF_F_ALL_TSO))
+		return true;
+
+	/* TSO is on: only suspected local UDP traffic is interesting */
+	if (skb->destructor != sock_wfree)
+		return false;
+
+	return (rcv->features & udp_gro) != 0;
+}
+
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
struct veth_rq *rq = NULL;
struct net_device *rcv;
int length = skb->len;
- bool rcv_xdp = false;
+ bool use_napi = false;
int rxq;
rcu_read_lock();
@@ -313,20 +333,26 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
rxq = skb_get_queue_mapping(skb);
if (rxq < rcv->real_num_rx_queues) {
rq = &rcv_priv->rq[rxq];
- rcv_xdp = rcu_access_pointer(rq->xdp_prog);
+
+ /* The napi pointer is available when an XDP program is
+ * attached or when GRO is enabled
+ * Don't bother with napi/GRO if the skb can't be aggregated
+ */
+ use_napi = rcu_access_pointer(rq->napi) &&
+ veth_skb_is_eligible_for_gro(dev, rcv, skb);
skb_record_rx_queue(skb, rxq);
}
skb_tx_timestamp(skb);
- if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
- if (!rcv_xdp)
+ if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
+ if (!use_napi)
dev_lstats_add(dev, length);
} else {
drop:
atomic64_inc(&priv->dropped);
}
- if (rcv_xdp)
+ if (use_napi)
__veth_xdp_flush(rq);
rcu_read_unlock();
@@ -686,7 +712,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
int mac_len, delta, off;
struct xdp_buff xdp;
- skb_orphan(skb);
+ skb_orphan_partial(skb);
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -903,7 +929,7 @@ static int veth_poll(struct napi_struct *napi, int budget)
return done;
}
-static int veth_napi_add(struct net_device *dev)
+static int __veth_napi_enable(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
int err, i;
@@ -920,6 +946,7 @@ static int veth_napi_add(struct net_device *dev)
struct veth_rq *rq = &priv->rq[i];
napi_enable(&rq->xdp_napi);
+ rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
}
return 0;
@@ -938,6 +965,7 @@ static void veth_napi_del(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
+ rcu_assign_pointer(priv->rq[i].napi, NULL);
napi_disable(&rq->xdp_napi);
__netif_napi_del(&rq->xdp_napi);
}
@@ -951,8 +979,14 @@ static void veth_napi_del(struct net_device *dev)
}
}
+/* Tell whether GRO was requested on @dev, i.e. whether NETIF_F_GRO is set
+ * in the device's wanted_features mask.
+ */
+static bool veth_gro_requested(const struct net_device *dev)
+{
+	return (dev->wanted_features & NETIF_F_GRO) != 0;
+}
+
static int veth_enable_xdp(struct net_device *dev)
{
+ bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
struct veth_priv *priv = netdev_priv(dev);
int err, i;
@@ -960,7 +994,8 @@ static int veth_enable_xdp(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
- netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
+ if (!napi_already_on)
+ netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
if (err < 0)
goto err_rxq_reg;
@@ -975,13 +1010,25 @@ static int veth_enable_xdp(struct net_device *dev)
rq->xdp_mem = rq->xdp_rxq.mem;
}
- err = veth_napi_add(dev);
- if (err)
- goto err_rxq_reg;
+ if (!napi_already_on) {
+ err = __veth_napi_enable(dev);
+ if (err)
+ goto err_rxq_reg;
+
+ if (!veth_gro_requested(dev)) {
+ /* user-space did not require GRO, but adding XDP
+ * is supposed to get GRO working
+ */
+ dev->features |= NETIF_F_GRO;
+ netdev_features_change(dev);
+ }
+ }
}
- for (i = 0; i < dev->real_num_rx_queues; i++)
+ for (i = 0; i < dev->real_num_rx_queues; i++) {
rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
+ rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
+ }
return 0;
err_reg_mem:
@@ -991,7 +1038,8 @@ err_rxq_reg:
struct veth_rq *rq = &priv->rq[i];
xdp_rxq_info_unreg(&rq->xdp_rxq);
- netif_napi_del(&rq->xdp_napi);
+ if (!napi_already_on)
+ netif_napi_del(&rq->xdp_napi);
}
return err;
@@ -1004,7 +1052,19 @@ static void veth_disable_xdp(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++)
rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
- veth_napi_del(dev);
+
+ if (!netif_running(dev) || !veth_gro_requested(dev)) {
+ veth_napi_del(dev);
+
+ /* if user-space did not require GRO, since adding XDP
+ * enabled it, clear it now
+ */
+ if (!veth_gro_requested(dev) && netif_running(dev)) {
+ dev->features &= ~NETIF_F_GRO;
+ netdev_features_change(dev);
+ }
+ }
+
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
@@ -1013,6 +1073,29 @@ static void veth_disable_xdp(struct net_device *dev)
}
}
+/* Register a NAPI instance (polled by veth_poll) on every real RX queue of
+ * @dev and then bring them up through __veth_napi_enable().
+ *
+ * Returns 0 on success. If __veth_napi_enable() fails, the NAPI instances
+ * registered here are removed again and its error code is returned.
+ */
+static int veth_napi_enable(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	int ret, q;
+
+	for (q = 0; q < dev->real_num_rx_queues; q++)
+		netif_napi_add(dev, &priv->rq[q].xdp_napi, veth_poll,
+			       NAPI_POLL_WEIGHT);
+
+	ret = __veth_napi_enable(dev);
+	if (!ret)
+		return 0;
+
+	/* enabling failed: undo the registrations done above */
+	for (q = 0; q < dev->real_num_rx_queues; q++)
+		netif_napi_del(&priv->rq[q].xdp_napi);
+
+	return ret;
+}
+
static int veth_open(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
@@ -1026,6 +1109,10 @@ static int veth_open(struct net_device *dev)
err = veth_enable_xdp(dev);
if (err)
return err;
+ } else if (veth_gro_requested(dev)) {
+ err = veth_napi_enable(dev);
+ if (err)
+ return err;
}
if (peer->flags & IFF_UP) {
@@ -1047,6 +1134,8 @@ static int veth_close(struct net_device *dev)
if (priv->_xdp_prog)
veth_disable_xdp(dev);
+ else if (veth_gro_requested(dev))
+ veth_napi_del(dev);
return 0;
}
@@ -1145,10 +1234,32 @@ static netdev_features_t veth_fix_features(struct net_device *dev,
if (peer_priv->_xdp_prog)
features &= ~NETIF_F_GSO_SOFTWARE;
}
+ if (priv->_xdp_prog)
+ features |= NETIF_F_GRO;
return features;
}
+/* .ndo_set_features callback: react to a change of the GRO feature bit.
+ *
+ * Work is needed only when NETIF_F_GRO actually flips on a device that is
+ * up and has no XDP program attached: NAPI is then enabled (GRO turned on)
+ * or torn down (GRO turned off). In every other case this is a no-op.
+ *
+ * Returns 0 on success or the error reported by veth_napi_enable().
+ */
+static int veth_set_features(struct net_device *dev,
+			     netdev_features_t features)
+{
+	netdev_features_t changed = features ^ dev->features;
+	struct veth_priv *priv = netdev_priv(dev);
+
+	if (!(changed & NETIF_F_GRO))
+		return 0;
+
+	if (!(dev->flags & IFF_UP))
+		return 0;
+
+	if (priv->_xdp_prog)
+		return 0;
+
+	if (!(features & NETIF_F_GRO)) {
+		veth_napi_del(dev);
+		return 0;
+	}
+
+	return veth_napi_enable(dev);
+}
+
static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
struct veth_priv *peer_priv, *priv = netdev_priv(dev);
@@ -1267,6 +1378,7 @@ static const struct net_device_ops veth_netdev_ops = {
#endif
.ndo_get_iflink = veth_get_iflink,
.ndo_fix_features = veth_fix_features,
+ .ndo_set_features = veth_set_features,
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = veth_set_rx_headroom,
.ndo_bpf = veth_xdp,
@@ -1329,6 +1441,13 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
static struct rtnl_link_ops veth_link_ops;
+/* Drop NETIF_F_GRO from both the wanted and the currently active feature
+ * sets of @dev, then call netdev_update_features() so the change is
+ * propagated.
+ */
+static void veth_disable_gro(struct net_device *dev)
+{
+	dev->wanted_features &= ~NETIF_F_GRO;
+	dev->features &= ~NETIF_F_GRO;
+
+	netdev_update_features(dev);
+}
+
static int veth_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
@@ -1401,6 +1520,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
if (err < 0)
goto err_register_peer;
+ /* keep GRO disabled by default to be consistent with the established
+ * veth behavior
+ */
+ veth_disable_gro(peer);
netif_carrier_off(peer);
err = rtnl_configure_link(peer, ifmp);
@@ -1438,6 +1561,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
+ veth_disable_gro(dev);
return 0;
err_register_dev:
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 2d71b283dde3..f4242a961088 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -24,6 +24,7 @@ TEST_PROGS += vrf_route_leaking.sh
TEST_PROGS += bareudp.sh
TEST_PROGS += unicast_extensions.sh
TEST_PROGS += udpgro_fwd.sh
+TEST_PROGS += veth.sh
TEST_PROGS_EXTENDED := in_netns.sh
TEST_GEN_FILES = socket nettest
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh
new file mode 100755
index 000000000000..2fedc0781ce8
--- /dev/null
+++ b/tools/testing/selftests/net/veth.sh
@@ -0,0 +1,177 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# temporary file used as nstat history; its random basename is also the
+# prefix of the network namespace names
+readonly STATS="$(mktemp -p /tmp ns-XXXXXX)"
+readonly BASE=$(basename $STATS)
+
+# numeric ids of the sending (SRC) and receiving (DST) side
+readonly SRC=2
+readonly DST=1
+readonly DST_NAT=100
+readonly NS_SRC=$BASE$SRC
+readonly NS_DST=$BASE$DST
+
+# "baremetal" network used for raw UDP traffic
+readonly BM_NET_V4=192.168.1.
+readonly BM_NET_V6=2001:db8::
+
+readonly NPROCS=$(nproc)
+
+# overall result: set to 1 by any failing check, used as exit status
+ret=0
+
+# Tear down the test environment: send SIGHUP to background jobs that are
+# still around, remove the nstat history file and delete both namespaces.
+# Registered as the EXIT trap and also called between test scenarios.
+cleanup() {
+	local ns
+	# plain 'local': the '-r' option is a bash extension and is rejected
+	# ("bad variable name") by POSIX shells such as dash, which may be
+	# what /bin/sh points to
+	local jobs="$(jobs -p)"
+	[ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null
+	rm -f $STATS
+
+	for ns in $NS_SRC $NS_DST; do
+		ip netns del $ns 2>/dev/null
+	done
+}
+
+trap cleanup EXIT
+
+# Build the test topology: two network namespaces, each with loopback up,
+# connected by a veth pair (veth$SRC <-> veth$DST) that is moved into the
+# namespaces and given one IPv4 and one IPv6 address per side.
+create_ns() {
+	local ns id
+
+	for ns in $NS_SRC $NS_DST; do
+		ip netns add $ns
+		ip -n $ns link set dev lo up
+	done
+
+	ip link add name veth$SRC type veth peer name veth$DST
+
+	for id in $SRC $DST; do
+		ip link set dev veth$id netns $BASE$id up
+		ip -n $BASE$id addr add dev veth$id $BM_NET_V4$id/24
+		ip -n $BASE$id addr add dev veth$id $BM_NET_V6$id/64 nodad
+	done
+
+	# NOTE(review): this creates a file named $BASE in the current
+	# directory, which cleanup() never removes; possibly $STATS (the
+	# nstat history file) was intended - confirm before changing
+	echo "#kernel" > $BASE
+	chmod go-rw $BASE
+}
+
+# __chk_flag <msg> <target> <expected> <flagname>
+# Read the state of the ethtool feature <flagname> on veth<target> (inside
+# namespace $BASE<target>), print <msg> followed by the outcome and set the
+# global 'ret' to 1 when the state differs from <expected>.
+__chk_flag() {
+	local msg="$1"
+	local target=$2
+	local expected=$3
+	local flagname=$4
+
+	# second column of the matching 'ethtool -k' line is the flag state
+	local flag=$(ip netns exec $BASE$target ethtool -k veth$target |\
+		     grep $flagname | awk '{print $2}')
+
+	printf "%-60s" "$msg"
+	if [ "$flag" != "$expected" ]; then
+		echo " fail - expected $expected found $flag"
+		ret=1
+		return
+	fi
+	echo " ok "
+}
+
+# chk_gro_flag <msg> <target> <expected>: check the GRO feature flag
+chk_gro_flag() { __chk_flag "$1" $2 $3 generic-receive-offload; }
+
+# chk_tso_flag <msg> <target> <expected>: check the TSO feature flag
+chk_tso_flag() { __chk_flag "$1" $2 $3 tcp-segmentation-offload; }
+
+# chk_gro <msg> <expected>
+# Send one UDP GSO burst from the source to the destination namespace and
+# compare the IpInReceives delta seen by the destination with <expected>.
+# Prints <msg> followed by the outcome; sets the global 'ret' to 1 on
+# failure.
+chk_gro() {
+	local msg="$1"
+	local expected=$2
+
+	# one ping before the counters are refreshed (presumably so that
+	# neighbour resolution does not show up in the packet count)
+	ip netns exec $BASE$SRC ping -qc 1 $BM_NET_V4$DST >/dev/null
+	NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat -n
+
+	printf "%-60s" "$msg"
+
+	# receiver runs in the background, give it a moment to start
+	ip netns exec $BASE$DST ./udpgso_bench_rx -C 1000 -R 10 &
+	local rx_pid=$!
+	sleep 0.1
+
+	ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 13000 -S 1300 -M 1 -D $BM_NET_V4$DST
+	local tx_rc=$?
+	wait $rx_pid
+	local rx_rc=$?
+	if [ ${rx_rc} -ne 0 ] || [ ${tx_rc} -ne 0 ]; then
+		echo " fail client exit code $tx_rc, server $rx_rc"
+		ret=1
+		return
+	fi
+
+	local pkts=$(NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat IpInReceives | \
+		     awk '{print $2}' | tail -n 1)
+	if [ "$pkts" != "$expected" ]; then
+		echo " fail - got $pkts packets, expected $expected "
+		ret=1
+		return
+	fi
+	echo " ok "
+}
+
+# The XDP scenario needs the dummy program built by the bpf selftests.
+if [ ! -f ../bpf/xdp_dummy.o ]; then
+	echo "Missing xdp_dummy helper. Build bpf selftest first"
+	# exit statuses are unsigned in POSIX sh: "exit -1" is rejected by
+	# dash ("Illegal number"), so report the failure with a plain 1
+	exit 1
+fi
+
+# Scenario 1: default configuration - GRO flag off on both ends, TSO on.
+create_ns
+chk_gro_flag "default - gro flag" $SRC off
+chk_gro_flag " - peer gro flag" $DST off
+chk_tso_flag " - tso flag" $SRC on
+chk_tso_flag " - peer tso flag" $DST on
+chk_gro " - aggregation" 1
+ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
+chk_gro " - aggregation with TSO off" 10
+cleanup
+
+# Scenario 2: GRO explicitly enabled on the receiver while its link is up.
+create_ns
+ip netns exec $NS_DST ethtool -K veth$DST gro on
+chk_gro_flag "with gro on - gro flag" $DST on
+chk_gro_flag " - peer gro flag" $SRC off
+chk_tso_flag " - tso flag" $SRC on
+chk_tso_flag " - peer tso flag" $DST on
+ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
+ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+chk_gro " - aggregation with TSO off" 1
+cleanup
+
+# Scenario 3: GRO enabled on the receiver while its link is down, then the
+# link is brought up again.
+create_ns
+ip -n $NS_DST link set dev veth$DST down
+ip netns exec $NS_DST ethtool -K veth$DST gro on
+chk_gro_flag "with gro enabled on link down - gro flag" $DST on
+chk_gro_flag " - peer gro flag" $SRC off
+chk_tso_flag " - tso flag" $SRC on
+chk_tso_flag " - peer tso flag" $DST on
+ip -n $NS_DST link set dev veth$DST up
+ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
+ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+chk_gro " - aggregation with TSO off" 1
+cleanup
+
+# Scenario 4: XDP program attached on the receiver.
+create_ns
+ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null
+chk_gro_flag "with xdp attached - gro flag" $DST on
+chk_gro_flag " - peer gro flag" $SRC off
+chk_tso_flag " - tso flag" $SRC off
+chk_tso_flag " - peer tso flag" $DST on
+ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
+chk_gro " - aggregation" 1
+
+
+# ... both links taken down: the GRO flags are expected to be unchanged
+ip -n $NS_DST link set dev veth$DST down
+ip -n $NS_SRC link set dev veth$SRC down
+chk_gro_flag " - after dev off, flag" $DST on
+chk_gro_flag " - peer flag" $SRC off
+
+# ... GRO requested and XDP detached while down, then links back up
+ip netns exec $NS_DST ethtool -K veth$DST gro on
+ip -n $NS_DST link set dev veth$DST xdp off
+chk_gro_flag " - after gro on xdp off, gro flag" $DST on
+chk_gro_flag " - peer gro flag" $SRC off
+chk_tso_flag " - tso flag" $SRC on
+chk_tso_flag " - peer tso flag" $DST on
+ip -n $NS_DST link set dev veth$DST up
+ip -n $NS_SRC link set dev veth$SRC up
+chk_gro " - aggregation" 1
+
+# ... finally GRO switched off again on the receiver
+ip netns exec $NS_DST ethtool -K veth$DST gro off
+ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
+chk_gro "aggregation again with default and TSO off" 10
+
+# the EXIT trap runs cleanup for the last scenario
+exit $ret