Merge branch 'virtio_net-XDP'

John Fastabend says: ==================== XDP for virtio_net This implements virtio_net for the mergeable buffers and big_packet modes. I tested this with vhost_net running on qemu and did not see any issues. For testing num_buf > 1 I added a hack to vhost driver to only but 100 bytes per buffer. There are some restrictions for XDP to be enabled and work well (see patch 3) for more details. 1. GUEST_TSO{4|6} must be off 2. MTU must be less than PAGE_SIZE 3. queues must be available to dedicate to XDP 4. num_bufs received in mergeable buffers must be 1 5. big_packet mode must have all data on single page To test this I used pktgen in the hypervisor and ran the XDP sample programs xdp1 and xdp2 from ./samples/bpf in the host. The default mode that is used with these patches with Linux guest and QEMU/Linux hypervisor is the mergeable buffers mode. I tested this mode for 2+ days running xdp2 without issues. Additionally I did a series of driver unload/load tests to check the allocate/release paths. To test the big_packets path I applied the following simple patch against the virtio driver forcing big_packets mode, --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2242,7 +2242,7 @@ static int virtnet_probe(struct virtio_device *vdev) vi->big_packets = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) - vi->mergeable_rx_bufs = true; + vi->mergeable_rx_bufs = false; if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) || virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) I then repeated the tests with xdp1 and xdp2. After letting them run for a few hours I called it good enough. Testing the unexpected case where virtio receives a packet across multiple buffers required patching the hypervisor vhost driver to convince it to send these unexpected packets. Then I used ping with the -s option to trigger the case with multiple buffers. This mode is not expected to be used but as MST pointed out per spec it is not strictly speaking illegal to generate multi-buffer packets so we need someway to handle these. The following patch can be used to generate multiple buffers, --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1777,7 +1777,8 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 _iov = iov + ret; size = node->size - addr + node->start; - _iov->iov_len = min((u64)len - s, size); + printk("%s: build 100 length headers!\n", __func__); + _iov->iov_len = min((u64)len - s, (u64)100);//size); _iov->iov_base = (void __user *)(unsigned long) (node->userspace_addr + addr - node->start); s += size; The qemu command I most frequently used for testing (although I did test various other combinations of devices) is the following, ./x86_64-softmmu/qemu-system-x86_64 \ -hda /var/lib/libvirt/images/Fedora-test0.img \ -m 4096 -enable-kvm -smp 2 \ -netdev tap,id=hn0,queues=4,vhost=on \ -device virtio-net-pci,netdev=hn0,mq=on,vectors=9,guest_tso4=off,guest_tso6=off \ -serial stdio The options 'guest_tso4=off,guest_tso6=off' are required because we do not support LRO with XDP at the moment. Please review any comments/feedback welcome as always. ==================== Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller <davem@davemloft.net> 2016-12-17 11:48:56 -0500
committer: David S. Miller <davem@davemloft.net> 2016-12-17 11:48:56 -0500
commit: cd333e377f803dbf8ec7588f598b70d4a1178f58 (patch)
tree: d622f48e13643a9adc2de081af61042757c5522e
parent: 08abb79542c9e8c367d1d8e44fe1026868d3f0a7 (diff)
parent: 72979a6c35907b6a7ab85e7bc60e0d52dba68f9d (diff)
3 files changed, 365 insertions, 7 deletions
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fe9f7726e990..08327e005ccc 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/virtio.h>
 #include <linux/virtio_net.h>
+#include <linux/bpf.h>
 #include <linux/scatterlist.h>
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
@@ -81,6 +82,8 @@ struct receive_queue {
 
 	struct napi_struct napi;
 
+	struct bpf_prog __rcu *xdp_prog;
+
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
@@ -111,6 +114,9 @@ struct virtnet_info {
 	/* # of queue pairs currently used by the driver */
 	u16 curr_queue_pairs;
 
+	/* # of XDP queue pairs currently used by the driver */
+	u16 xdp_queue_pairs;
+
 	/* I like... big packets and I cannot lie! */
 	bool big_packets;
 
@@ -324,6 +330,90 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
+static void virtnet_xdp_xmit(struct virtnet_info *vi,
+			     struct receive_queue *rq,
+			     struct send_queue *sq,
+			     struct xdp_buff *xdp)
+{
+	struct page *page = virt_to_head_page(xdp->data);
+	struct virtio_net_hdr_mrg_rxbuf *hdr;
+	unsigned int num_sg, len;
+	void *xdp_sent;
+	int err;
+
+	/* Free up any pending old buffers before queueing new ones. */
+	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
+		struct page *sent_page = virt_to_head_page(xdp_sent);
+
+		if (vi->mergeable_rx_bufs)
+			put_page(sent_page);
+		else
+			give_pages(rq, sent_page);
+	}
+
+	/* Zero header and leave csum up to XDP layers */
+	hdr = xdp->data;
+	memset(hdr, 0, vi->hdr_len);
+
+	num_sg = 1;
+	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
+				   xdp->data, GFP_ATOMIC);
+	if (unlikely(err)) {
+		if (vi->mergeable_rx_bufs)
+			put_page(page);
+		else
+			give_pages(rq, page);
+		return; // On error abort to avoid unnecessary kick
+	} else if (!vi->mergeable_rx_bufs) {
+		/* If not mergeable bufs must be big packets so cleanup pages */
+		give_pages(rq, (struct page *)page->private);
+		page->private = 0;
+	}
+
+	virtqueue_kick(sq->vq);
+}
+
+static u32 do_xdp_prog(struct virtnet_info *vi,
+		       struct receive_queue *rq,
+		       struct bpf_prog *xdp_prog,
+		       struct page *page, int offset, int len)
+{
+	int hdr_padded_len;
+	struct xdp_buff xdp;
+	unsigned int qp;
+	u32 act;
+	u8 *buf;
+
+	buf = page_address(page) + offset;
+
+	if (vi->mergeable_rx_bufs)
+		hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	else
+		hdr_padded_len = sizeof(struct padded_vnet_hdr);
+
+	xdp.data = buf + hdr_padded_len;
+	xdp.data_end = xdp.data + (len - vi->hdr_len);
+
+	act = bpf_prog_run_xdp(xdp_prog, &xdp);
+	switch (act) {
+	case XDP_PASS:
+		return XDP_PASS;
+	case XDP_TX:
+		qp = vi->curr_queue_pairs -
+			vi->xdp_queue_pairs +
+			smp_processor_id();
+		xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
+		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp);
+		return XDP_TX;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+	case XDP_ABORTED:
+	case XDP_DROP:
+		return XDP_DROP;
+	}
+}
+
 static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
 {
 	struct sk_buff * skb = buf;
@@ -340,17 +430,102 @@ static struct sk_buff *receive_big(struct net_device *dev,
 				   void *buf,
 				   unsigned int len)
 {
+	struct bpf_prog *xdp_prog;
 	struct page *page = buf;
-	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
+	struct sk_buff *skb;
 
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (xdp_prog) {
+		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
+		u32 act;
+
+		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
+			goto err_xdp;
+		act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
+		switch (act) {
+		case XDP_PASS:
+			break;
+		case XDP_TX:
+			rcu_read_unlock();
+			goto xdp_xmit;
+		case XDP_DROP:
+		default:
+			goto err_xdp;
+		}
+	}
+	rcu_read_unlock();
+
+	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 	if (unlikely(!skb))
 		goto err;
 
 	return skb;
 
+err_xdp:
+	rcu_read_unlock();
 err:
 	dev->stats.rx_dropped++;
 	give_pages(rq, page);
+xdp_xmit:
+	return NULL;
+}
+
+/* The conditions to enable XDP should preclude the underlying device from
+ * sending packets across multiple buffers (num_buf > 1). However per spec
+ * it does not appear to be illegal to do so but rather just against convention.
+ * So in order to avoid making a system unresponsive the packets are pushed
+ * into a page and the XDP program is run. This will be extremely slow and we
+ * push a warning to the user to fix this as soon as possible. Fixing this may
+ * require resolving the underlying hardware to determine why multiple buffers
+ * are being received or simply loading the XDP program in the ingress stack
+ * after the skb is built because there is no advantage to running it here
+ * anymore.
+ */
+static struct page *xdp_linearize_page(struct receive_queue *rq,
+				       u16 num_buf,
+				       struct page *p,
+				       int offset,
+				       unsigned int *len)
+{
+	struct page *page = alloc_page(GFP_ATOMIC);
+	unsigned int page_off = 0;
+
+	if (!page)
+		return NULL;
+
+	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
+	page_off += *len;
+
+	while (--num_buf) {
+		unsigned int buflen;
+		unsigned long ctx;
+		void *buf;
+		int off;
+
+		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
+		if (unlikely(!ctx))
+			goto err_buf;
+
+		/* guard against a misconfigured or uncooperative backend that
+		 * is sending packet larger than the MTU.
+		 */
+		if ((page_off + buflen) > PAGE_SIZE)
+			goto err_buf;
+
+		buf = mergeable_ctx_to_buf_address(ctx);
+		p = virt_to_head_page(buf);
+		off = buf - page_address(p);
+
+		memcpy(page_address(page) + page_off,
+		       page_address(p) + off, buflen);
+		page_off += buflen;
+	}
+
+	*len = page_off;
+	return page;
+err_buf:
+	__free_pages(page, 0);
 	return NULL;
 }
 
@@ -365,11 +540,67 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
-	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+	struct sk_buff *head_skb, *curr_skb;
+	struct bpf_prog *xdp_prog;
+	unsigned int truesize;
+
+	head_skb = NULL;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (xdp_prog) {
+		struct page *xdp_page;
+		u32 act;
+
+		/* No known backend devices should send packets with
+		 * more than a single buffer when XDP conditions are
+		 * met. However it is not strictly illegal so the case
+		 * is handled as an exception and a warning is thrown.
+		 */
+		if (unlikely(num_buf > 1)) {
+			bpf_warn_invalid_xdp_buffer();
+
+			/* linearize data for XDP */
+			xdp_page = xdp_linearize_page(rq, num_buf,
+						      page, offset, &len);
+			if (!xdp_page)
+				goto err_xdp;
+			offset = 0;
+		} else {
+			xdp_page = page;
+		}
+
+		/* Transient failure which in theory could occur if
+		 * in-flight packets from before XDP was enabled reach
+		 * the receive path after XDP is loaded. In practice I
+		 * was not able to create this condition.
+		 */
+		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
+			goto err_xdp;
+
+		act = do_xdp_prog(vi, rq, xdp_prog, page, offset, len);
+		switch (act) {
+		case XDP_PASS:
+			if (unlikely(xdp_page != page))
+				__free_pages(xdp_page, 0);
+			break;
+		case XDP_TX:
+			if (unlikely(xdp_page != page))
+				goto err_xdp;
+			rcu_read_unlock();
+			goto xdp_xmit;
+		case XDP_DROP:
+		default:
+			if (unlikely(xdp_page != page))
+				__free_pages(xdp_page, 0);
+			goto err_xdp;
+		}
+	}
+	rcu_read_unlock();
 
-	struct sk_buff *head_skb = page_to_skb(vi, rq, page, offset, len,
-					       truesize);
-	struct sk_buff *curr_skb = head_skb;
+	truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
+	curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
 		goto err_skb;
@@ -423,6 +654,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
 	return head_skb;
 
+err_xdp:
+	rcu_read_unlock();
 err_skb:
 	put_page(page);
 	while (--num_buf) {
@@ -439,6 +672,7 @@ err_skb:
 err_buf:
 	dev->stats.rx_dropped++;
 	dev_kfree_skb(head_skb);
+xdp_xmit:
 	return NULL;
 }
 
@@ -1337,6 +1571,13 @@ static int virtnet_set_channels(struct net_device *dev,
 	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
 		return -EINVAL;
 
+	/* For now we don't support modifying channels while XDP is loaded
+	 * also when XDP is loaded all RX queues have XDP programs so we only
+	 * need to check a single RX queue.
+	 */
+	if (vi->rq[0].xdp_prog)
+		return -EINVAL;
+
 	get_online_cpus();
 	err = virtnet_set_queues(vi, queue_pairs);
 	if (!err) {
@@ -1428,6 +1669,93 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
 	.set_settings = virtnet_set_settings,
 };
 
+static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
+{
+	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	u16 xdp_qp = 0, curr_qp;
+	int i, err;
+
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
+	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6)) {
+		netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
+		netdev_warn(dev, "XDP expects header/data in single page, any_header_sg required\n");
+		return -EINVAL;
+	}
+
+	if (dev->mtu > max_sz) {
+		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
+		return -EINVAL;
+	}
+
+	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
+	if (prog)
+		xdp_qp = nr_cpu_ids;
+
+	/* XDP requires extra queues for XDP_TX */
+	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
+		netdev_warn(dev, "request %i queues but max is %i\n",
+			    curr_qp + xdp_qp, vi->max_queue_pairs);
+		return -ENOMEM;
+	}
+
+	err = virtnet_set_queues(vi, curr_qp + xdp_qp);
+	if (err) {
+		dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
+		return err;
+	}
+
+	if (prog) {
+		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
+		if (IS_ERR(prog)) {
+			virtnet_set_queues(vi, curr_qp);
+			return PTR_ERR(prog);
+		}
+	}
+
+	vi->xdp_queue_pairs = xdp_qp;
+	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
+		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
+		if (old_prog)
+			bpf_prog_put(old_prog);
+	}
+
+	return 0;
+}
+
+static bool virtnet_xdp_query(struct net_device *dev)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		if (vi->rq[i].xdp_prog)
+			return true;
+	}
+	return false;
+}
+
+static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return virtnet_xdp_set(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_attached = virtnet_xdp_query(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -1444,6 +1772,7 @@ static const struct net_device_ops virtnet_netdev = {
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	.ndo_busy_poll		= virtnet_busy_poll,
 #endif
+	.ndo_xdp		= virtnet_xdp,
 };
 
 static void virtnet_config_changed_work(struct work_struct *work)
@@ -1505,12 +1834,20 @@ static void virtnet_free_queues(struct virtnet_info *vi)
 
 static void free_receive_bufs(struct virtnet_info *vi)
 {
+	struct bpf_prog *old_prog;
 	int i;
 
+	rtnl_lock();
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		while (vi->rq[i].pages)
 			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
+
+		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
+		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
+		if (old_prog)
+			bpf_prog_put(old_prog);
 	}
+	rtnl_unlock();
 }
 
 static void free_receive_page_frags(struct virtnet_info *vi)
@@ -1521,6 +1858,16 @@ static void free_receive_page_frags(struct virtnet_info *vi)
 			put_page(vi->rq[i].alloc_frag.page);
 }
 
+static bool is_xdp_queue(struct virtnet_info *vi, int q)
+{
+	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
+		return false;
+	else if (q < vi->curr_queue_pairs)
+		return true;
+	else
+		return false;
+}
+
 static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
@@ -1528,8 +1875,12 @@ static void free_unused_bufs(struct virtnet_info *vi)
 
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		struct virtqueue *vq = vi->sq[i].vq;
-		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
-			dev_kfree_skb(buf);
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
+			if (!is_xdp_queue(vi, i))
+				dev_kfree_skb(buf);
+			else
+				put_page(virt_to_head_page(buf));
+		}
 	}
 
 	for (i = 0; i < vi->max_queue_pairs; i++) {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6a1658308612..af8a1804cac6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -602,6 +602,7 @@ bool bpf_helper_changes_pkt_data(void *func);
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 void bpf_warn_invalid_xdp_action(u32 act);
+void bpf_warn_invalid_xdp_buffer(void);
 
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
diff --git a/net/core/filter.c b/net/core/filter.c
index b1461708a977..7190bd648154 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2972,6 +2972,12 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
+void bpf_warn_invalid_xdp_buffer(void)
+{
+	WARN_ONCE(1, "Illegal XDP buffer encountered, expect throughput degradation\n");
+}
+EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer);
+
 static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					int src_reg, int ctx_off,
 					struct bpf_insn *insn_buf,
author	David S. Miller <davem@davemloft.net>	2016-12-17 11:48:56 -0500
committer	David S. Miller <davem@davemloft.net>	2016-12-17 11:48:56 -0500
commit	cd333e377f803dbf8ec7588f598b70d4a1178f58 (patch)
tree	d622f48e13643a9adc2de081af61042757c5522e
parent	08abb79542c9e8c367d1d8e44fe1026868d3f0a7 (diff)
parent	72979a6c35907b6a7ab85e7bc60e0d52dba68f9d (diff)