vhost: vhost TX zero-copy support
>From: Shirley Ma <mashirle@us.ibm.com>
This adds experimental zero copy support in vhost-net,
disabled by default. To enable, set
experimental_zcopytx module option to 1.
This patch maintains the outstanding userspace buffers in the
sequence it is delivered to vhost. The outstanding userspace buffers
will be marked as done once the lower device buffers DMA has finished.
This is monitored through last reference of kfree_skb callback. Two
buffer indices are used for this purpose.
The vhost-net device passes the userspace buffers info to lower device
skb through message control. DMA done status check and guest
notification are handled by handle_tx: in the worst case is all buffers
in the vq are in pending/done status, so we need to notify guest to
release DMA done buffers first before we get any new buffers from the
vq.
One known problem is that if the guest stops submitting
buffers, buffers might never get used until some
further action, e.g. device reset. This does not
seem to affect linux guests.
Signed-off-by: Shirley <xma@us.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e224a92..f0fd52c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -12,6 +12,7 @@
#include <linux/virtio_net.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
@@ -28,10 +29,18 @@
#include "vhost.h"
+static int experimental_zcopytx;
+module_param(experimental_zcopytx, int, 0444);
+MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");
+
/* Max number of bytes transferred before requeueing the job.
* Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000
+/* MAX number of TX used buffers for outstanding zerocopy */
+#define VHOST_MAX_PEND 128
+#define VHOST_GOODCOPY_LEN 256
+
enum {
VHOST_NET_VQ_RX = 0,
VHOST_NET_VQ_TX = 1,
@@ -54,6 +63,12 @@
enum vhost_net_poll_state tx_poll_state;
};
+static bool vhost_sock_zcopy(struct socket *sock)
+{
+ return unlikely(experimental_zcopytx) &&
+ sock_flag(sock->sk, SOCK_ZEROCOPY);
+}
+
/* Pop first len bytes from iovec. Return number of segments used. */
static int move_iovec_hdr(struct iovec *from, struct iovec *to,
size_t len, int iov_count)
@@ -129,6 +144,8 @@
int err, wmem;
size_t hdr_size;
struct socket *sock;
+ struct vhost_ubuf_ref *uninitialized_var(ubufs);
+ bool zcopy;
/* TODO: check that we are running from vhost_worker? */
sock = rcu_dereference_check(vq->private_data, 1);
@@ -149,8 +166,13 @@
if (wmem < sock->sk->sk_sndbuf / 2)
tx_poll_stop(net);
hdr_size = vq->vhost_hlen;
+ zcopy = vhost_sock_zcopy(sock);
for (;;) {
+ /* Release DMAs done buffers first */
+ if (zcopy)
+ vhost_zerocopy_signal_used(vq);
+
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
ARRAY_SIZE(vq->iov),
&out, &in,
@@ -166,6 +188,13 @@
set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
break;
}
+ /* If more outstanding DMAs, queue the work */
+ if (unlikely(vq->upend_idx - vq->done_idx >
+ VHOST_MAX_PEND)) {
+ tx_poll_start(net, sock);
+ set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+ break;
+ }
if (unlikely(vhost_enable_notify(&net->dev, vq))) {
vhost_disable_notify(&net->dev, vq);
continue;
@@ -188,9 +217,39 @@
iov_length(vq->hdr, s), hdr_size);
break;
}
+ /* use msg_control to pass vhost zerocopy ubuf info to skb */
+ if (zcopy) {
+ vq->heads[vq->upend_idx].id = head;
+ if (len < VHOST_GOODCOPY_LEN) {
+ /* copy don't need to wait for DMA done */
+ vq->heads[vq->upend_idx].len =
+ VHOST_DMA_DONE_LEN;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ ubufs = NULL;
+ } else {
+ struct ubuf_info *ubuf = &vq->ubuf_info[head];
+
+ vq->heads[vq->upend_idx].len = len;
+ ubuf->callback = vhost_zerocopy_callback;
+ ubuf->arg = vq->ubufs;
+ ubuf->desc = vq->upend_idx;
+ msg.msg_control = ubuf;
+ msg.msg_controllen = sizeof(ubuf);
+ ubufs = vq->ubufs;
+ kref_get(&ubufs->kref);
+ }
+ vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+ }
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(NULL, sock, &msg, len);
if (unlikely(err < 0)) {
+ if (zcopy) {
+ if (ubufs)
+ vhost_ubuf_put(ubufs);
+ vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
+ UIO_MAXIOV;
+ }
vhost_discard_vq_desc(vq, 1);
tx_poll_start(net, sock);
break;
@@ -198,7 +257,8 @@
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- vhost_add_used_and_signal(&net->dev, vq, head, 0);
+ if (!zcopy)
+ vhost_add_used_and_signal(&net->dev, vq, head, 0);
total_len += len;
if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll);
@@ -603,6 +663,7 @@
{
struct socket *sock, *oldsock;
struct vhost_virtqueue *vq;
+ struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
int r;
mutex_lock(&n->dev.mutex);
@@ -632,6 +693,13 @@
oldsock = rcu_dereference_protected(vq->private_data,
lockdep_is_held(&vq->mutex));
if (sock != oldsock) {
+ ubufs = vhost_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock));
+ if (IS_ERR(ubufs)) {
+ r = PTR_ERR(ubufs);
+ goto err_ubufs;
+ }
+ oldubufs = vq->ubufs;
+ vq->ubufs = ubufs;
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, sock);
vhost_net_enable_vq(n, vq);
@@ -639,6 +707,9 @@
mutex_unlock(&vq->mutex);
+ if (oldubufs)
+ vhost_ubuf_put_and_wait(oldubufs);
+
if (oldsock) {
vhost_net_flush_vq(n, index);
fput(oldsock->file);
@@ -647,6 +718,8 @@
mutex_unlock(&n->dev.mutex);
return 0;
+err_ubufs:
+ fput(sock->file);
err_vq:
mutex_unlock(&vq->mutex);
err:
@@ -776,6 +849,8 @@
static int vhost_net_init(void)
{
+ if (experimental_zcopytx)
+ vhost_enable_zcopy(VHOST_NET_VQ_TX);
return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);