author     David S. Miller <davem@davemloft.net>   2017-05-18 10:07:42 -0400
committer  David S. Miller <davem@davemloft.net>   2017-05-18 10:07:42 -0400
commit     f646c75b79c31a7fc40211e7291287e2b99ee168
tree       240db037b0414eda56435162247acabd75dd1477
parent     1fc4d180b3c6bed0e7f5160bcd553aec89594962
parent     c67df11f6e48061e43e9bf9dade83fe268b47d27
Merge branch 'vhost_net-rx-batch-dequeuing'
Jason Wang says:
====================
vhost_net rx batch dequeuing
This series tries to implement rx batching for vhost-net. This is done
by batching the dequeuing from the skb_array exported by the underlying
socket and passing the skb back through msg_control to finish the
userspace copy. It is also a prerequisite for implementing more
batching on the rx path.
Tests show at most a 7.56% improvement in rx pps on top of batch
zeroing, and no obvious change in the TCP_STREAM/TCP_RR results.
Please review.
Thanks
Changes from V4:
- drop batch zeroing patch
- renew the performance numbers
- move skb pointer array out of vhost_net structure
Changes from V3:
- add batch zeroing patch to fix the build warnings
Changes from V2:
- rebase to net-next HEAD
- use unconsume helpers to put skbs back on release
- introduce and use vhost_net internal buffer helpers
- renew performance numbers on top of batch zeroing
Changes from V1:
- switch to use for() in __ptr_ring_consume_batched()
- rename peek_head_len_batched() to fetch_skbs()
- use skb_array_consume_batched() instead of
  skb_array_consume_batched_bh() since no consumer runs in bh
- drop the lockless peeking patch since skb_array could be resized, so
  it's not safe to call the lockless variant
====================
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
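
The core of the series is a small per-virtqueue buffer that handle_rx()
refills with one batched dequeue and then drains one skb at a time,
handing each skb to recvmsg() through msg_control. Below is a minimal
userspace sketch of that head/tail pattern, not the kernel code: the
demo_* names are invented for illustration and a plain array stands in
for the real skb_array/ptr_ring.

#include <stdio.h>
#include <stddef.h>

#define BATCH 64

struct demo_buf {
	void *queue[BATCH];
	int head;
	int tail;
};

/* Stand-in for skb_array_consume_batched(): copy up to n entries out. */
static int demo_ring_consume_batched(void **src, int avail, void **dst, int n)
{
	int i;

	for (i = 0; i < n && i < avail; i++)
		dst[i] = src[i];
	return i;
}

static int demo_buf_is_empty(const struct demo_buf *b)
{
	return b->head == b->tail;
}

/* Refill the local buffer with a single batched dequeue
 * (the role of vhost_net_buf_produce() in the diff below). */
static int demo_buf_produce(struct demo_buf *b, void **src, int avail)
{
	b->head = 0;
	b->tail = demo_ring_consume_batched(src, avail, b->queue, BATCH);
	return b->tail;
}

/* Hand out one prefetched pointer, as handle_rx() does via msg_control. */
static void *demo_buf_consume(struct demo_buf *b)
{
	return demo_buf_is_empty(b) ? NULL : b->queue[b->head++];
}

int main(void)
{
	int data[4] = { 1, 2, 3, 4 };
	void *ring[4] = { &data[0], &data[1], &data[2], &data[3] };
	struct demo_buf buf = { .head = 0, .tail = 0 };
	void *p;

	demo_buf_produce(&buf, ring, 4);
	while ((p = demo_buf_consume(&buf)))
		printf("consumed %d\n", *(int *)p);
	return 0;
}

The point of the batching is that up to VHOST_RX_BATCH pointers are
pulled per consumer-lock acquisition instead of one, which is where the
rx pps improvement quoted above comes from.
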
-rw-r--r--  drivers/net/tap.c          |  25
-rw-r--r--  drivers/net/tun.c          |  31
-rw-r--r--  drivers/vhost/net.c        | 128
-rw-r--r--  include/linux/if_tap.h     |   5
-rw-r--r--  include/linux/if_tun.h     |   5
-rw-r--r--  include/linux/ptr_ring.h   | 120
-rw-r--r--  include/linux/skb_array.h  |  31
7 files changed, 327 insertions, 18 deletions
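
For context on the tun/tap side of the diff below: tap_do_read() and
tun_do_read() gain an skb argument so that a packet vhost has already
pulled out of the ring can be passed in through m->msg_control instead
of being dequeued again. A small userspace model of that handoff, using
invented demo_* names, is:

#include <stdio.h>
#include <stddef.h>

static const char *backlog[] = { "pkt-a", "pkt-b" };
static size_t backlog_pos;

/* Stand-in for tun_ring_recv(): the device dequeues from its own queue. */
static const char *demo_dequeue(void)
{
	if (backlog_pos >= sizeof(backlog) / sizeof(backlog[0]))
		return NULL;
	return backlog[backlog_pos++];
}

/* Stand-in for tun_do_read(): prefer a packet the caller already dequeued. */
static int demo_recv(const char *prefetched)
{
	const char *pkt = prefetched ? prefetched : demo_dequeue();

	if (!pkt)
		return -1;
	printf("copied %s to userspace\n", pkt);
	return 0;
}

int main(void)
{
	/* vhost-style caller: already holds a batched packet, hands it in. */
	demo_recv("pkt-from-batch");
	/* plain read()-style caller: no prefetched packet, dequeue inside. */
	demo_recv(NULL);
	return 0;
}
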
diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 4d4173d25dd0..9af3239d6ad5 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -824,15 +824,17 @@ done: static ssize_t tap_do_read(struct tap_queue *q, struct iov_iter *to, - int noblock) + int noblock, struct sk_buff *skb) { DEFINE_WAIT(wait); - struct sk_buff *skb; ssize_t ret = 0; if (!iov_iter_count(to)) return 0; + if (skb) + goto put; + while (1) { if (!noblock) prepare_to_wait(sk_sleep(&q->sk), &wait, @@ -856,6 +858,7 @@ static ssize_t tap_do_read(struct tap_queue *q, if (!noblock) finish_wait(sk_sleep(&q->sk), &wait); +put: if (skb) { ret = tap_put_user(q, skb, to); if (unlikely(ret < 0)) @@ -872,7 +875,7 @@ static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to) struct tap_queue *q = file->private_data; ssize_t len = iov_iter_count(to), ret; - ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK); + ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; @@ -1155,7 +1158,8 @@ static int tap_recvmsg(struct socket *sock, struct msghdr *m, int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) return -EINVAL; - ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT); + ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, + m->msg_control); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; @@ -1193,6 +1197,19 @@ struct socket *tap_get_socket(struct file *file) } EXPORT_SYMBOL_GPL(tap_get_socket); +struct skb_array *tap_get_skb_array(struct file *file) +{ + struct tap_queue *q; + + if (file->f_op != &tap_fops) + return ERR_PTR(-EINVAL); + q = file->private_data; + if (!q) + return ERR_PTR(-EBADFD); + return &q->skb_array; +} +EXPORT_SYMBOL_GPL(tap_get_skb_array); + int tap_queue_resize(struct tap_dev *tap) { struct net_device *dev = tap->dev; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index bbd707b9ef7a..f8041f9c7e65 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1510,9 +1510,8 @@ out: static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, - int noblock) + int noblock, struct sk_buff *skb) { - struct sk_buff *skb; ssize_t ret; int err; @@ -1521,10 +1520,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, if (!iov_iter_count(to)) return 0; - /* Read frames from ring */ - skb = tun_ring_recv(tfile, noblock, &err); - if (!skb) - return err; + if (!skb) { + /* Read frames from ring */ + skb = tun_ring_recv(tfile, noblock, &err); + if (!skb) + return err; + } ret = tun_put_user(tun, tfile, skb, to); if (unlikely(ret < 0)) @@ -1544,7 +1545,7 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!tun) return -EBADFD; - ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK); + ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; @@ -1646,7 +1647,8 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT); + ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, + m->msg_control); if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? 
ret : total_len; @@ -2626,6 +2628,19 @@ struct socket *tun_get_socket(struct file *file) } EXPORT_SYMBOL_GPL(tun_get_socket); +struct skb_array *tun_get_skb_array(struct file *file) +{ + struct tun_file *tfile; + + if (file->f_op != &tun_fops) + return ERR_PTR(-EINVAL); + tfile = file->private_data; + if (!tfile) + return ERR_PTR(-EBADFD); + return &tfile->tx_array; +} +EXPORT_SYMBOL_GPL(tun_get_skb_array); + module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f61f852d6cfd..e3d7ea1288c6 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -28,6 +28,8 @@ #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> +#include <linux/skb_array.h> +#include <linux/skbuff.h> #include <net/sock.h> @@ -85,6 +87,13 @@ struct vhost_net_ubuf_ref { struct vhost_virtqueue *vq; }; +#define VHOST_RX_BATCH 64 +struct vhost_net_buf { + struct sk_buff **queue; + int tail; + int head; +}; + struct vhost_net_virtqueue { struct vhost_virtqueue vq; size_t vhost_hlen; @@ -99,6 +108,8 @@ struct vhost_net_virtqueue { /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ struct vhost_net_ubuf_ref *ubufs; + struct skb_array *rx_array; + struct vhost_net_buf rxq; }; struct vhost_net { @@ -117,6 +128,71 @@ struct vhost_net { static unsigned vhost_net_zcopy_mask __read_mostly; +static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) +{ + if (rxq->tail != rxq->head) + return rxq->queue[rxq->head]; + else + return NULL; +} + +static int vhost_net_buf_get_size(struct vhost_net_buf *rxq) +{ + return rxq->tail - rxq->head; +} + +static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) +{ + return rxq->tail == rxq->head; +} + +static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) +{ + void *ret = vhost_net_buf_get_ptr(rxq); + ++rxq->head; + return ret; +} + +static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) +{ + struct vhost_net_buf *rxq = &nvq->rxq; + + rxq->head = 0; + rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue, + VHOST_RX_BATCH); + return rxq->tail; +} + +static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) +{ + struct vhost_net_buf *rxq = &nvq->rxq; + + if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) { + skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head, + vhost_net_buf_get_size(rxq)); + rxq->head = rxq->tail = 0; + } +} + +static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) +{ + struct vhost_net_buf *rxq = &nvq->rxq; + + if (!vhost_net_buf_is_empty(rxq)) + goto out; + + if (!vhost_net_buf_produce(nvq)) + return 0; + +out: + return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq)); +} + +static void vhost_net_buf_init(struct vhost_net_buf *rxq) +{ + rxq->head = rxq->tail = 0; +} + static void vhost_net_enable_zcopy(int vq) { vhost_net_zcopy_mask |= 0x1 << vq; @@ -201,6 +277,7 @@ static void vhost_net_vq_reset(struct vhost_net *n) n->vqs[i].ubufs = NULL; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; + vhost_net_buf_init(&n->vqs[i].rxq); } } @@ -503,15 +580,14 @@ out: mutex_unlock(&vq->mutex); } -static int peek_head_len(struct sock *sk) +static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) { - struct socket *sock = sk->sk_socket; struct sk_buff *head; int len = 0; unsigned long flags; - if (sock->ops->peek_len) - return sock->ops->peek_len(sock); + if (rvq->rx_array) + return vhost_net_buf_peek(rvq); 
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); @@ -537,10 +613,11 @@ static int sk_has_rx_data(struct sock *sk) static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) { + struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned long uninitialized_var(endtime); - int len = peek_head_len(sk); + int len = peek_head_len(rvq, sk); if (!len && vq->busyloop_timeout) { /* Both tx vq and rx socket were polled here */ @@ -561,7 +638,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) vhost_poll_queue(&vq->poll); mutex_unlock(&vq->mutex); - len = peek_head_len(sk); + len = peek_head_len(rvq, sk); } return len; @@ -699,6 +776,8 @@ static void handle_rx(struct vhost_net *net) /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; + if (nvq->rx_array) + msg.msg_control = vhost_net_buf_consume(&nvq->rxq); /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); @@ -815,6 +894,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; + struct sk_buff **queue; int i; n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_REPEAT); @@ -826,6 +906,15 @@ static int vhost_net_open(struct inode *inode, struct file *f) return -ENOMEM; } + queue = kmalloc_array(VHOST_RX_BATCH, sizeof(struct sk_buff *), + GFP_KERNEL); + if (!queue) { + kfree(vqs); + kvfree(n); + return -ENOMEM; + } + n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; + dev = &n->dev; vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; @@ -838,6 +927,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) n->vqs[i].done_idx = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; + vhost_net_buf_init(&n->vqs[i].rxq); } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX); @@ -853,11 +943,14 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock; + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); mutex_lock(&vq->mutex); sock = vq->private_data; vhost_net_disable_vq(n, vq); vq->private_data = NULL; + vhost_net_buf_unproduce(nvq); mutex_unlock(&vq->mutex); return sock; } @@ -912,6 +1005,7 @@ static int vhost_net_release(struct inode *inode, struct file *f) /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. 
*/ vhost_net_flush(n); + kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->dev.vqs); kvfree(n); return 0; @@ -950,6 +1044,25 @@ err: return ERR_PTR(r); } +static struct skb_array *get_tap_skb_array(int fd) +{ + struct skb_array *array; + struct file *file = fget(fd); + + if (!file) + return NULL; + array = tun_get_skb_array(file); + if (!IS_ERR(array)) + goto out; + array = tap_get_skb_array(file); + if (!IS_ERR(array)) + goto out; + array = NULL; +out: + fput(file); + return array; +} + static struct socket *get_tap_socket(int fd) { struct file *file = fget(fd); @@ -1026,6 +1139,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) vhost_net_disable_vq(n, vq); vq->private_data = sock; + vhost_net_buf_unproduce(nvq); + if (index == VHOST_NET_VQ_RX) + nvq->rx_array = get_tap_skb_array(fd); r = vhost_vq_init_access(vq); if (r) goto err_used; diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 3482c3c2037d..4837157da0dc 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -3,6 +3,7 @@ #if IS_ENABLED(CONFIG_TAP) struct socket *tap_get_socket(struct file *); +struct skb_array *tap_get_skb_array(struct file *file); #else #include <linux/err.h> #include <linux/errno.h> @@ -12,6 +13,10 @@ static inline struct socket *tap_get_socket(struct file *f) { return ERR_PTR(-EINVAL); } +static inline struct skb_array *tap_get_skb_array(struct file *f) +{ + return ERR_PTR(-EINVAL); +} #endif /* CONFIG_TAP */ #include <net/sock.h> diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index ed6da2e6df90..bf9bdf42d577 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -19,6 +19,7 @@ #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); +struct skb_array *tun_get_skb_array(struct file *file); #else #include <linux/err.h> #include <linux/errno.h> @@ -28,5 +29,9 @@ static inline struct socket *tun_get_socket(struct file *f) { return ERR_PTR(-EINVAL); } +static inline struct skb_array *tun_get_skb_array(struct file *f) +{ + return ERR_PTR(-EINVAL); +} #endif /* CONFIG_TUN */ #endif /* __IF_TUN_H */ diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6b2e0dd88569..d8c97ec8a8e6 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -278,6 +278,22 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r) return ptr; } +static inline int __ptr_ring_consume_batched(struct ptr_ring *r, + void **array, int n) +{ + void *ptr; + int i; + + for (i = 0; i < n; i++) { + ptr = __ptr_ring_consume(r); + if (!ptr) + break; + array[i] = ptr; + } + + return i; +} + /* * Note: resize (below) nests producer lock within consumer lock, so if you * call this in interrupt or BH context, you must disable interrupts/BH when @@ -328,6 +344,55 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r) return ptr; } +static inline int ptr_ring_consume_batched(struct ptr_ring *r, + void **array, int n) +{ + int ret; + + spin_lock(&r->consumer_lock); + ret = __ptr_ring_consume_batched(r, array, n); + spin_unlock(&r->consumer_lock); + + return ret; +} + +static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r, + void **array, int n) +{ + int ret; + + spin_lock_irq(&r->consumer_lock); + ret = __ptr_ring_consume_batched(r, array, n); + spin_unlock_irq(&r->consumer_lock); + + return ret; +} + +static inline int ptr_ring_consume_batched_any(struct ptr_ring *r, + void **array, int n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&r->consumer_lock, flags); + 
ret = __ptr_ring_consume_batched(r, array, n); + spin_unlock_irqrestore(&r->consumer_lock, flags); + + return ret; +} + +static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, + void **array, int n) +{ + int ret; + + spin_lock_bh(&r->consumer_lock); + ret = __ptr_ring_consume_batched(r, array, n); + spin_unlock_bh(&r->consumer_lock); + + return ret; +} + /* Cast to structure type and call a function without discarding from FIFO. * Function must return a value. * Callers must take consumer_lock. @@ -403,6 +468,61 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) return 0; } +/* + * Return entries into ring. Destroy entries that don't fit. + * + * Note: this is expected to be a rare slow path operation. + * + * Note: producer lock is nested within consumer lock, so if you + * resize you must make sure all uses nest correctly. + * In particular if you consume ring in interrupt or BH context, you must + * disable interrupts/BH when doing so. + */ +static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, + void (*destroy)(void *)) +{ + unsigned long flags; + int head; + + spin_lock_irqsave(&r->consumer_lock, flags); + spin_lock(&r->producer_lock); + + if (!r->size) + goto done; + + /* + * Clean out buffered entries (for simplicity). This way following code + * can test entries for NULL and if not assume they are valid. + */ + head = r->consumer_head - 1; + while (likely(head >= r->consumer_tail)) + r->queue[head--] = NULL; + r->consumer_tail = r->consumer_head; + + /* + * Go over entries in batch, start moving head back and copy entries. + * Stop when we run into previously unconsumed entries. + */ + while (n) { + head = r->consumer_head - 1; + if (head < 0) + head = r->size - 1; + if (r->queue[head]) { + /* This batch entry will have to be destroyed. */ + goto done; + } + r->queue[head] = batch[--n]; + r->consumer_tail = r->consumer_head = head; + } + +done: + /* Destroy all entries left in the batch. 
*/ + while (n) + destroy(batch[--n]); + spin_unlock(&r->producer_lock); + spin_unlock_irqrestore(&r->consumer_lock, flags); +} + static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, int size, gfp_t gfp, void (*destroy)(void *)) diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index f4dfade428f0..35226cd4efb0 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -97,21 +97,46 @@ static inline struct sk_buff *skb_array_consume(struct skb_array *a) return ptr_ring_consume(&a->ring); } +static inline int skb_array_consume_batched(struct skb_array *a, + struct sk_buff **array, int n) +{ + return ptr_ring_consume_batched(&a->ring, (void **)array, n); +} + static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a) { return ptr_ring_consume_irq(&a->ring); } +static inline int skb_array_consume_batched_irq(struct skb_array *a, + struct sk_buff **array, int n) +{ + return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n); +} + static inline struct sk_buff *skb_array_consume_any(struct skb_array *a) { return ptr_ring_consume_any(&a->ring); } +static inline int skb_array_consume_batched_any(struct skb_array *a, + struct sk_buff **array, int n) +{ + return ptr_ring_consume_batched_any(&a->ring, (void **)array, n); +} + + static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a) { return ptr_ring_consume_bh(&a->ring); } +static inline int skb_array_consume_batched_bh(struct skb_array *a, + struct sk_buff **array, int n) +{ + return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n); +} + static inline int __skb_array_len_with_tag(struct sk_buff *skb) { if (likely(skb)) { @@ -156,6 +181,12 @@ static void __skb_array_destroy_skb(void *ptr) kfree_skb(ptr); } +static inline void skb_array_unconsume(struct skb_array *a, + struct sk_buff **skbs, int n) +{ + ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb); +} + static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); |
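
When the backend is released or swapped, skbs still sitting in the
local batch are returned to the ring via skb_array_unconsume() /
ptr_ring_unconsume(), and anything that cannot be returned is
destroyed. The sketch below models only that intent with a flat array;
the demo_* names are invented and the real ring indexing is not
reproduced.

#include <stdio.h>
#include <stddef.h>

#define QSZ 3

static void demo_destroy(void *p)
{
	printf("destroy %s\n", (const char *)p);
}

/*
 * q[0..*fill-1] holds entries not yet consumed; batch[0..n-1] were
 * dequeued earlier.  Put the batch back in front of the existing
 * entries (inserting from the end so batch[0] ends up first again)
 * and destroy whatever no longer fits.
 */
static void demo_unconsume(void **q, int *fill, void **batch, int n,
			   void (*destroy)(void *))
{
	int i;

	while (n && *fill < QSZ) {
		for (i = *fill; i > 0; i--)	/* make room at the front */
			q[i] = q[i - 1];
		q[0] = batch[--n];
		(*fill)++;
	}
	while (n)				/* no room left: destroy */
		destroy(batch[--n]);
}

int main(void)
{
	static char o0[] = "old-0", b0[] = "b0", b1[] = "b1", b2[] = "b2";
	void *q[QSZ] = { o0 };
	void *batch[] = { b0, b1, b2 };
	int fill = 1, i;

	demo_unconsume(q, &fill, batch, 3, demo_destroy);
	for (i = 0; i < fill; i++)
		printf("queue[%d] = %s\n", i, (const char *)q[i]);
	return 0;
}
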