summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/net/tap.c25
-rw-r--r--drivers/net/tun.c31
-rw-r--r--drivers/vhost/net.c128
-rw-r--r--include/linux/if_tap.h5
-rw-r--r--include/linux/if_tun.h5
-rw-r--r--include/linux/ptr_ring.h120
-rw-r--r--include/linux/skb_array.h31
7 files changed, 327 insertions, 18 deletions
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 4d4173d25dd0..9af3239d6ad5 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -824,15 +824,17 @@ done:
static ssize_t tap_do_read(struct tap_queue *q,
struct iov_iter *to,
- int noblock)
+ int noblock, struct sk_buff *skb)
{
DEFINE_WAIT(wait);
- struct sk_buff *skb;
ssize_t ret = 0;
if (!iov_iter_count(to))
return 0;
+ if (skb)
+ goto put;
+
while (1) {
if (!noblock)
prepare_to_wait(sk_sleep(&q->sk), &wait,
@@ -856,6 +858,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
if (!noblock)
finish_wait(sk_sleep(&q->sk), &wait);
+put:
if (skb) {
ret = tap_put_user(q, skb, to);
if (unlikely(ret < 0))
@@ -872,7 +875,7 @@ static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct tap_queue *q = file->private_data;
ssize_t len = iov_iter_count(to), ret;
- ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK);
+ ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL);
ret = min_t(ssize_t, ret, len);
if (ret > 0)
iocb->ki_pos = ret;
@@ -1155,7 +1158,8 @@ static int tap_recvmsg(struct socket *sock, struct msghdr *m,
int ret;
if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
return -EINVAL;
- ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
+ ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT,
+ m->msg_control);
if (ret > total_len) {
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
@@ -1193,6 +1197,19 @@ struct socket *tap_get_socket(struct file *file)
}
EXPORT_SYMBOL_GPL(tap_get_socket);
+struct skb_array *tap_get_skb_array(struct file *file)
+{
+ struct tap_queue *q;
+
+ if (file->f_op != &tap_fops)
+ return ERR_PTR(-EINVAL);
+ q = file->private_data;
+ if (!q)
+ return ERR_PTR(-EBADFD);
+ return &q->skb_array;
+}
+EXPORT_SYMBOL_GPL(tap_get_skb_array);
+
int tap_queue_resize(struct tap_dev *tap)
{
struct net_device *dev = tap->dev;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bbd707b9ef7a..f8041f9c7e65 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1510,9 +1510,8 @@ out:
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct iov_iter *to,
- int noblock)
+ int noblock, struct sk_buff *skb)
{
- struct sk_buff *skb;
ssize_t ret;
int err;
@@ -1521,10 +1520,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
if (!iov_iter_count(to))
return 0;
- /* Read frames from ring */
- skb = tun_ring_recv(tfile, noblock, &err);
- if (!skb)
- return err;
+ if (!skb) {
+ /* Read frames from ring */
+ skb = tun_ring_recv(tfile, noblock, &err);
+ if (!skb)
+ return err;
+ }
ret = tun_put_user(tun, tfile, skb, to);
if (unlikely(ret < 0))
@@ -1544,7 +1545,7 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!tun)
return -EBADFD;
- ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK);
+ ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL);
ret = min_t(ssize_t, ret, len);
if (ret > 0)
iocb->ki_pos = ret;
@@ -1646,7 +1647,8 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
SOL_PACKET, TUN_TX_TIMESTAMP);
goto out;
}
- ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT);
+ ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT,
+ m->msg_control);
if (ret > (ssize_t)total_len) {
m->msg_flags |= MSG_TRUNC;
ret = flags & MSG_TRUNC ? ret : total_len;
@@ -2626,6 +2628,19 @@ struct socket *tun_get_socket(struct file *file)
}
EXPORT_SYMBOL_GPL(tun_get_socket);
+struct skb_array *tun_get_skb_array(struct file *file)
+{
+ struct tun_file *tfile;
+
+ if (file->f_op != &tun_fops)
+ return ERR_PTR(-EINVAL);
+ tfile = file->private_data;
+ if (!tfile)
+ return ERR_PTR(-EBADFD);
+ return &tfile->tx_array;
+}
+EXPORT_SYMBOL_GPL(tun_get_skb_array);
+
module_init(tun_init);
module_exit(tun_cleanup);
MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f61f852d6cfd..e3d7ea1288c6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -28,6 +28,8 @@
#include <linux/if_macvlan.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
+#include <linux/skb_array.h>
+#include <linux/skbuff.h>
#include <net/sock.h>
@@ -85,6 +87,13 @@ struct vhost_net_ubuf_ref {
struct vhost_virtqueue *vq;
};
+#define VHOST_RX_BATCH 64
+struct vhost_net_buf {
+ struct sk_buff **queue;
+ int tail;
+ int head;
+};
+
struct vhost_net_virtqueue {
struct vhost_virtqueue vq;
size_t vhost_hlen;
@@ -99,6 +108,8 @@ struct vhost_net_virtqueue {
/* Reference counting for outstanding ubufs.
* Protected by vq mutex. Writers must also take device mutex. */
struct vhost_net_ubuf_ref *ubufs;
+ struct skb_array *rx_array;
+ struct vhost_net_buf rxq;
};
struct vhost_net {
@@ -117,6 +128,71 @@ struct vhost_net {
static unsigned vhost_net_zcopy_mask __read_mostly;
+static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
+{
+ if (rxq->tail != rxq->head)
+ return rxq->queue[rxq->head];
+ else
+ return NULL;
+}
+
+static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
+{
+ return rxq->tail - rxq->head;
+}
+
+static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
+{
+ return rxq->tail == rxq->head;
+}
+
+static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
+{
+ void *ret = vhost_net_buf_get_ptr(rxq);
+ ++rxq->head;
+ return ret;
+}
+
+static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
+{
+ struct vhost_net_buf *rxq = &nvq->rxq;
+
+ rxq->head = 0;
+ rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
+ VHOST_RX_BATCH);
+ return rxq->tail;
+}
+
+static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
+{
+ struct vhost_net_buf *rxq = &nvq->rxq;
+
+ if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) {
+ skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head,
+ vhost_net_buf_get_size(rxq));
+ rxq->head = rxq->tail = 0;
+ }
+}
+
+static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
+{
+ struct vhost_net_buf *rxq = &nvq->rxq;
+
+ if (!vhost_net_buf_is_empty(rxq))
+ goto out;
+
+ if (!vhost_net_buf_produce(nvq))
+ return 0;
+
+out:
+ return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
+}
+
+static void vhost_net_buf_init(struct vhost_net_buf *rxq)
+{
+ rxq->head = rxq->tail = 0;
+}
+
static void vhost_net_enable_zcopy(int vq)
{
vhost_net_zcopy_mask |= 0x1 << vq;
@@ -201,6 +277,7 @@ static void vhost_net_vq_reset(struct vhost_net *n)
n->vqs[i].ubufs = NULL;
n->vqs[i].vhost_hlen = 0;
n->vqs[i].sock_hlen = 0;
+ vhost_net_buf_init(&n->vqs[i].rxq);
}
}
@@ -503,15 +580,14 @@ out:
mutex_unlock(&vq->mutex);
}
-static int peek_head_len(struct sock *sk)
+static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
{
- struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;
- if (sock->ops->peek_len)
- return sock->ops->peek_len(sock);
+ if (rvq->rx_array)
+ return vhost_net_buf_peek(rvq);
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
@@ -537,10 +613,11 @@ static int sk_has_rx_data(struct sock *sk)
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
+ struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned long uninitialized_var(endtime);
- int len = peek_head_len(sk);
+ int len = peek_head_len(rvq, sk);
if (!len && vq->busyloop_timeout) {
/* Both tx vq and rx socket were polled here */
@@ -561,7 +638,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
vhost_poll_queue(&vq->poll);
mutex_unlock(&vq->mutex);
- len = peek_head_len(sk);
+ len = peek_head_len(rvq, sk);
}
return len;
@@ -699,6 +776,8 @@ static void handle_rx(struct vhost_net *net)
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
goto out;
+ if (nvq->rx_array)
+ msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
/* On overrun, truncate and discard */
if (unlikely(headcount > UIO_MAXIOV)) {
iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
@@ -815,6 +894,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
struct vhost_net *n;
struct vhost_dev *dev;
struct vhost_virtqueue **vqs;
+ struct sk_buff **queue;
int i;
n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_REPEAT);
@@ -826,6 +906,15 @@ static int vhost_net_open(struct inode *inode, struct file *f)
return -ENOMEM;
}
+ queue = kmalloc_array(VHOST_RX_BATCH, sizeof(struct sk_buff *),
+ GFP_KERNEL);
+ if (!queue) {
+ kfree(vqs);
+ kvfree(n);
+ return -ENOMEM;
+ }
+ n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;
+
dev = &n->dev;
vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
@@ -838,6 +927,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
n->vqs[i].done_idx = 0;
n->vqs[i].vhost_hlen = 0;
n->vqs[i].sock_hlen = 0;
+ vhost_net_buf_init(&n->vqs[i].rxq);
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
@@ -853,11 +943,14 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
struct vhost_virtqueue *vq)
{
struct socket *sock;
+ struct vhost_net_virtqueue *nvq =
+ container_of(vq, struct vhost_net_virtqueue, vq);
mutex_lock(&vq->mutex);
sock = vq->private_data;
vhost_net_disable_vq(n, vq);
vq->private_data = NULL;
+ vhost_net_buf_unproduce(nvq);
mutex_unlock(&vq->mutex);
return sock;
}
@@ -912,6 +1005,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
/* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
+ kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
kfree(n->dev.vqs);
kvfree(n);
return 0;
@@ -950,6 +1044,25 @@ err:
return ERR_PTR(r);
}
+static struct skb_array *get_tap_skb_array(int fd)
+{
+ struct skb_array *array;
+ struct file *file = fget(fd);
+
+ if (!file)
+ return NULL;
+ array = tun_get_skb_array(file);
+ if (!IS_ERR(array))
+ goto out;
+ array = tap_get_skb_array(file);
+ if (!IS_ERR(array))
+ goto out;
+ array = NULL;
+out:
+ fput(file);
+ return array;
+}
+
static struct socket *get_tap_socket(int fd)
{
struct file *file = fget(fd);
@@ -1026,6 +1139,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
vhost_net_disable_vq(n, vq);
vq->private_data = sock;
+ vhost_net_buf_unproduce(nvq);
+ if (index == VHOST_NET_VQ_RX)
+ nvq->rx_array = get_tap_skb_array(fd);
r = vhost_vq_init_access(vq);
if (r)
goto err_used;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 3482c3c2037d..4837157da0dc 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -3,6 +3,7 @@
#if IS_ENABLED(CONFIG_TAP)
struct socket *tap_get_socket(struct file *);
+struct skb_array *tap_get_skb_array(struct file *file);
#else
#include <linux/err.h>
#include <linux/errno.h>
@@ -12,6 +13,10 @@ static inline struct socket *tap_get_socket(struct file *f)
{
return ERR_PTR(-EINVAL);
}
+static inline struct skb_array *tap_get_skb_array(struct file *f)
+{
+ return ERR_PTR(-EINVAL);
+}
#endif /* CONFIG_TAP */
#include <net/sock.h>
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index ed6da2e6df90..bf9bdf42d577 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,6 +19,7 @@
#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *);
+struct skb_array *tun_get_skb_array(struct file *file);
#else
#include <linux/err.h>
#include <linux/errno.h>
@@ -28,5 +29,9 @@ static inline struct socket *tun_get_socket(struct file *f)
{
return ERR_PTR(-EINVAL);
}
+static inline struct skb_array *tun_get_skb_array(struct file *f)
+{
+ return ERR_PTR(-EINVAL);
+}
#endif /* CONFIG_TUN */
#endif /* __IF_TUN_H */
diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6b2e0dd88569..d8c97ec8a8e6 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -278,6 +278,22 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
return ptr;
}
+static inline int __ptr_ring_consume_batched(struct ptr_ring *r,
+ void **array, int n)
+{
+ void *ptr;
+ int i;
+
+ for (i = 0; i < n; i++) {
+ ptr = __ptr_ring_consume(r);
+ if (!ptr)
+ break;
+ array[i] = ptr;
+ }
+
+ return i;
+}
+
/*
* Note: resize (below) nests producer lock within consumer lock, so if you
* call this in interrupt or BH context, you must disable interrupts/BH when
@@ -328,6 +344,55 @@ static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
return ptr;
}
+static inline int ptr_ring_consume_batched(struct ptr_ring *r,
+ void **array, int n)
+{
+ int ret;
+
+ spin_lock(&r->consumer_lock);
+ ret = __ptr_ring_consume_batched(r, array, n);
+ spin_unlock(&r->consumer_lock);
+
+ return ret;
+}
+
+static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r,
+ void **array, int n)
+{
+ int ret;
+
+ spin_lock_irq(&r->consumer_lock);
+ ret = __ptr_ring_consume_batched(r, array, n);
+ spin_unlock_irq(&r->consumer_lock);
+
+ return ret;
+}
+
+static inline int ptr_ring_consume_batched_any(struct ptr_ring *r,
+ void **array, int n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&r->consumer_lock, flags);
+ ret = __ptr_ring_consume_batched(r, array, n);
+ spin_unlock_irqrestore(&r->consumer_lock, flags);
+
+ return ret;
+}
+
+static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
+ void **array, int n)
+{
+ int ret;
+
+ spin_lock_bh(&r->consumer_lock);
+ ret = __ptr_ring_consume_batched(r, array, n);
+ spin_unlock_bh(&r->consumer_lock);
+
+ return ret;
+}
+
/* Cast to structure type and call a function without discarding from FIFO.
* Function must return a value.
* Callers must take consumer_lock.
@@ -403,6 +468,61 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
return 0;
}
+/*
+ * Return entries into ring. Destroy entries that don't fit.
+ *
+ * Note: this is expected to be a rare slow path operation.
+ *
+ * Note: producer lock is nested within consumer lock, so if you
+ * resize you must make sure all uses nest correctly.
+ * In particular if you consume ring in interrupt or BH context, you must
+ * disable interrupts/BH when doing so.
+ */
+static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
+ void (*destroy)(void *))
+{
+ unsigned long flags;
+ int head;
+
+ spin_lock_irqsave(&r->consumer_lock, flags);
+ spin_lock(&r->producer_lock);
+
+ if (!r->size)
+ goto done;
+
+ /*
+ * Clean out buffered entries (for simplicity). This way following code
+ * can test entries for NULL and if not assume they are valid.
+ */
+ head = r->consumer_head - 1;
+ while (likely(head >= r->consumer_tail))
+ r->queue[head--] = NULL;
+ r->consumer_tail = r->consumer_head;
+
+ /*
+ * Go over entries in batch, start moving head back and copy entries.
+ * Stop when we run into previously unconsumed entries.
+ */
+ while (n) {
+ head = r->consumer_head - 1;
+ if (head < 0)
+ head = r->size - 1;
+ if (r->queue[head]) {
+ /* This batch entry will have to be destroyed. */
+ goto done;
+ }
+ r->queue[head] = batch[--n];
+ r->consumer_tail = r->consumer_head = head;
+ }
+
+done:
+ /* Destroy all entries left in the batch. */
+ while (n)
+ destroy(batch[--n]);
+ spin_unlock(&r->producer_lock);
+ spin_unlock_irqrestore(&r->consumer_lock, flags);
+}
+
static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
int size, gfp_t gfp,
void (*destroy)(void *))
diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index f4dfade428f0..35226cd4efb0 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -97,21 +97,46 @@ static inline struct sk_buff *skb_array_consume(struct skb_array *a)
return ptr_ring_consume(&a->ring);
}
+static inline int skb_array_consume_batched(struct skb_array *a,
+ struct sk_buff **array, int n)
+{
+ return ptr_ring_consume_batched(&a->ring, (void **)array, n);
+}
+
static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a)
{
return ptr_ring_consume_irq(&a->ring);
}
+static inline int skb_array_consume_batched_irq(struct skb_array *a,
+ struct sk_buff **array, int n)
+{
+ return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n);
+}
+
static inline struct sk_buff *skb_array_consume_any(struct skb_array *a)
{
return ptr_ring_consume_any(&a->ring);
}
+static inline int skb_array_consume_batched_any(struct skb_array *a,
+ struct sk_buff **array, int n)
+{
+ return ptr_ring_consume_batched_any(&a->ring, (void **)array, n);
+}
+
+
static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a)
{
return ptr_ring_consume_bh(&a->ring);
}
+static inline int skb_array_consume_batched_bh(struct skb_array *a,
+ struct sk_buff **array, int n)
+{
+ return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n);
+}
+
static inline int __skb_array_len_with_tag(struct sk_buff *skb)
{
if (likely(skb)) {
@@ -156,6 +181,12 @@ static void __skb_array_destroy_skb(void *ptr)
kfree_skb(ptr);
}
+static inline void skb_array_unconsume(struct skb_array *a,
+ struct sk_buff **skbs, int n)
+{
+ ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb);
+}
+
static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
{
return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);