vhost-net: mergeable buffers support
This adds support for mergeable buffers in vhost-net: this is needed
for older guests without indirect buffer support, as well
as for zero copy with some devices.
Includes changes by Michael S. Tsirkin to make the
patch as low risk as possible (i.e., close to no changes
when feature is disabled).
Signed-off-by: David Stevens <dlstevens@us.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d395b592..f13e56b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,6 +74,22 @@
}
return seg;
}
+/* Copy iovec entries for len bytes from iovec. */
+static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
+ size_t len, int iovcount)
+{
+ int seg = 0;
+ size_t size;
+ while (len && seg < iovcount) {
+ size = min(from->iov_len, len);
+ to->iov_base = from->iov_base;
+ to->iov_len = size;
+ len -= size;
+ ++from;
+ ++to;
+ ++seg;
+ }
+}
/* Caller must have TX VQ lock */
static void tx_poll_stop(struct vhost_net *net)
@@ -129,7 +145,7 @@
if (wmem < sock->sk->sk_sndbuf / 2)
tx_poll_stop(net);
- hdr_size = vq->hdr_size;
+ hdr_size = vq->vhost_hlen;
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
@@ -172,7 +188,7 @@
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(NULL, sock, &msg, len);
if (unlikely(err < 0)) {
- vhost_discard_vq_desc(vq);
+ vhost_discard_vq_desc(vq, 1);
tx_poll_start(net, sock);
break;
}
@@ -191,9 +207,82 @@
unuse_mm(net->dev.mm);
}
+static int peek_head_len(struct sock *sk)
+{
+ struct sk_buff *head;
+ int len = 0;
+
+ lock_sock(sk);
+ head = skb_peek(&sk->sk_receive_queue);
+ if (head)
+ len = head->len;
+ release_sock(sk);
+ return len;
+}
+
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ * vq has read descriptors only.
+ * @vq - the relevant virtqueue
+ * @datalen - data length we'll be reading
+ * @iovcount - returned count of io vectors we fill
+ * @log - vhost log
+ * @log_num - log offset
+ * returns number of buffer heads allocated, negative on error
+ */
+static int get_rx_bufs(struct vhost_virtqueue *vq,
+ struct vring_used_elem *heads,
+ int datalen,
+ unsigned *iovcount,
+ struct vhost_log *log,
+ unsigned *log_num)
+{
+ unsigned int out, in;
+ int seg = 0;
+ int headcount = 0;
+ unsigned d;
+ int r, nlogs = 0;
+
+ while (datalen > 0) {
+ if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
+ r = -ENOBUFS;
+ goto err;
+ }
+ d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg,
+ ARRAY_SIZE(vq->iov) - seg, &out,
+ &in, log, log_num);
+ if (d == vq->num) {
+ r = 0;
+ goto err;
+ }
+ if (unlikely(out || in <= 0)) {
+ vq_err(vq, "unexpected descriptor format for RX: "
+ "out %d, in %d\n", out, in);
+ r = -EINVAL;
+ goto err;
+ }
+ if (unlikely(log)) {
+ nlogs += *log_num;
+ log += *log_num;
+ }
+ heads[headcount].id = d;
+ heads[headcount].len = iov_length(vq->iov + seg, in);
+ datalen -= heads[headcount].len;
+ ++headcount;
+ seg += in;
+ }
+ heads[headcount - 1].len += datalen;
+ *iovcount = seg;
+ if (unlikely(log))
+ *log_num = nlogs;
+ return headcount;
+err:
+ vhost_discard_vq_desc(vq, headcount);
+ return r;
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
-static void handle_rx(struct vhost_net *net)
+static void handle_rx_big(struct vhost_net *net)
{
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
unsigned out, in, log, s;
@@ -223,7 +312,7 @@
use_mm(net->dev.mm);
mutex_lock(&vq->mutex);
vhost_disable_notify(vq);
- hdr_size = vq->hdr_size;
+ hdr_size = vq->vhost_hlen;
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
@@ -270,14 +359,14 @@
len, MSG_DONTWAIT | MSG_TRUNC);
/* TODO: Check specific error and bomb out unless EAGAIN? */
if (err < 0) {
- vhost_discard_vq_desc(vq);
+ vhost_discard_vq_desc(vq, 1);
break;
}
/* TODO: Should check and handle checksum. */
if (err > len) {
pr_debug("Discarded truncated rx packet: "
" len %d > %zd\n", err, len);
- vhost_discard_vq_desc(vq);
+ vhost_discard_vq_desc(vq, 1);
continue;
}
len = err;
@@ -302,6 +391,123 @@
unuse_mm(net->dev.mm);
}
+/* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+static void handle_rx_mergeable(struct vhost_net *net)
+{
+ struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+ unsigned uninitialized_var(in), log;
+ struct vhost_log *vq_log;
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_control = NULL, /* FIXME: get and handle RX aux data. */
+ .msg_controllen = 0,
+ .msg_iov = vq->iov,
+ .msg_flags = MSG_DONTWAIT,
+ };
+
+ struct virtio_net_hdr_mrg_rxbuf hdr = {
+ .hdr.flags = 0,
+ .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
+ };
+
+ size_t total_len = 0;
+ int err, headcount;
+ size_t vhost_hlen, sock_hlen;
+ size_t vhost_len, sock_len;
+ struct socket *sock = rcu_dereference(vq->private_data);
+ if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+ return;
+
+ use_mm(net->dev.mm);
+ mutex_lock(&vq->mutex);
+ vhost_disable_notify(vq);
+ vhost_hlen = vq->vhost_hlen;
+ sock_hlen = vq->sock_hlen;
+
+ vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+ vq->log : NULL;
+
+ while ((sock_len = peek_head_len(sock->sk))) {
+ sock_len += sock_hlen;
+ vhost_len = sock_len + vhost_hlen;
+ headcount = get_rx_bufs(vq, vq->heads, vhost_len,
+ &in, vq_log, &log);
+ /* On error, stop handling until the next kick. */
+ if (unlikely(headcount < 0))
+ break;
+ /* OK, now we need to know about added descriptors. */
+ if (!headcount) {
+ if (unlikely(vhost_enable_notify(vq))) {
+ /* They have slipped one in as we were
+ * doing that: check again. */
+ vhost_disable_notify(vq);
+ continue;
+ }
+ /* Nothing new? Wait for eventfd to tell us
+ * they refilled. */
+ break;
+ }
+ /* We don't need to be notified again. */
+ if (unlikely((vhost_hlen)))
+ /* Skip header. TODO: support TSO. */
+ move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+ else
+ /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
+ * needed because sendmsg can modify msg_iov. */
+ copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
+ msg.msg_iovlen = in;
+ err = sock->ops->recvmsg(NULL, sock, &msg,
+ sock_len, MSG_DONTWAIT | MSG_TRUNC);
+ /* Userspace might have consumed the packet meanwhile:
+ * it's not supposed to do this usually, but might be hard
+ * to prevent. Discard data we got (if any) and keep going. */
+ if (unlikely(err != sock_len)) {
+ pr_debug("Discarded rx packet: "
+ " len %d, expected %zd\n", err, sock_len);
+ vhost_discard_vq_desc(vq, headcount);
+ continue;
+ }
+ if (unlikely(vhost_hlen) &&
+ memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
+ vhost_hlen)) {
+ vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
+ vq->iov->iov_base);
+ break;
+ }
+ /* TODO: Should check and handle checksum. */
+ if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
+ memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
+ offsetof(typeof(hdr), num_buffers),
+ sizeof hdr.num_buffers)) {
+ vq_err(vq, "Failed num_buffers write");
+ vhost_discard_vq_desc(vq, headcount);
+ break;
+ }
+ vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+ headcount);
+ if (unlikely(vq_log))
+ vhost_log_write(vq, vq_log, log, vhost_len);
+ total_len += vhost_len;
+ if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+
+ mutex_unlock(&vq->mutex);
+ unuse_mm(net->dev.mm);
+}
+
+static void handle_rx(struct vhost_net *net)
+{
+ if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
+ handle_rx_mergeable(net);
+ else
+ handle_rx_big(net);
+}
+
static void handle_tx_kick(struct vhost_work *work)
{
struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
@@ -577,9 +783,21 @@
static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
- size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
- sizeof(struct virtio_net_hdr) : 0;
+ size_t vhost_hlen, sock_hlen, hdr_len;
int i;
+
+ hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
+ sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+ sizeof(struct virtio_net_hdr);
+ if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
+ /* vhost provides vnet_hdr */
+ vhost_hlen = hdr_len;
+ sock_hlen = 0;
+ } else {
+ /* socket provides vnet_hdr */
+ vhost_hlen = 0;
+ sock_hlen = hdr_len;
+ }
mutex_lock(&n->dev.mutex);
if ((features & (1 << VHOST_F_LOG_ALL)) &&
!vhost_log_access_ok(&n->dev)) {
@@ -590,7 +808,8 @@
smp_wmb();
for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
mutex_lock(&n->vqs[i].mutex);
- n->vqs[i].hdr_size = hdr_size;
+ n->vqs[i].vhost_hlen = vhost_hlen;
+ n->vqs[i].sock_hlen = sock_hlen;
mutex_unlock(&n->vqs[i].mutex);
}
vhost_net_flush(n);