arch/um/drivers/virtio_uml.c - linux - Git at Google

 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Virtio vhost-user driver
  *
  * Copyright(c) 2019 Intel Corporation
  *
  * This driver allows virtio devices to be used over a vhost-user socket.
  *
  * Guest devices can be instantiated by kernel module or command line
  * parameters. One device will be created for each parameter. Syntax:
  *
  *		virtio_uml.device=<socket>:<virtio_id>[:<platform_id>]
  * where:
  *		<socket>	:= vhost-user socket path to connect
  *		<virtio_id>	:= virtio device id (as in virtio_ids.h)
  *		<platform_id>	:= (optional) platform device id
  *
  * example:
  *		virtio_uml.device=/var/uml.socket:1
  *
  * Based on Virtio MMIO driver by Pawel Moll, copyright 2011-2014, ARM Ltd.
  */
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/virtio.h>
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/time-internal.h>
 #include <shared/as-layout.h>
 #include <irq_kern.h>
 #include <init.h>
 #include <os.h>
 #include "vhost_user.h"

 /* Workaround due to a conflict between irq_user.h and irqreturn.h */
 #ifdef IRQ_NONE
 #undef IRQ_NONE
 #endif

 /* Ring size requested from vring_create_virtqueue() in vu_setup_vq(). */
 #define MAX_SUPPORTED_QUEUE_SIZE	256

 /* Map an embedded struct virtio_device back to its virtio_uml_device. */
 #define to_virtio_uml_device(_vdev) \
 	container_of(_vdev, struct virtio_uml_device, vdev)

 /*
  * Platform data attached to each "virtio-uml" platform device created
  * from the command line / module parameter.
  */
 struct virtio_uml_platform_data {
 	/* virtio device id, copied into vdev.id.device by the probe function */
 	u32 virtio_device_id;
 	/* vhost-user socket path the probe function connects to */
 	const char *socket_path;
 	/* scheduled by vhost_user_recv() on -ECONNRESET; runs vu_conn_broken() */
 	struct work_struct conn_broken_wk;
 	/* back-pointer to the owning platform device, set in vu_cmdline_set() */
 	struct platform_device *pdev;
 };

 /* Per-device state for one vhost-user backed virtio device. */
 struct virtio_uml_device {
 	/* embedded virtio device; see to_virtio_uml_device() */
 	struct virtio_device vdev;
 	/* platform device this virtio device was probed from */
 	struct platform_device *pdev;

 	/* held by vhost_user_send() around the send and the optional ACK read */
 	spinlock_t sock_lock;
 	/*
 	 * sock: connected vhost-user socket.
 	 * req_fd: read end of the slave request pipe, or -1 if not set up.
 	 */
 	int sock, req_fd;
 	/* feature bits read from the device / later negotiated */
 	u64 features;
 	/* protocol feature bits, masked with VHOST_USER_SUPPORTED_PROTOCOL_F */
 	u64 protocol_features;
 	/* virtio status byte, kept locally (see vu_get_status/vu_set_status) */
 	u8 status;
 	/* set once register_virtio_device() has succeeded */
 	u8 registered:1;
 };

 struct virtio_uml_vq_info {
 	int kick_fd, call_fd;
 	char name[32];
 #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
 	struct virtqueue *vq;
 	vq_callback_t *callback;
 	struct time_travel_event defer;
 #endif
 };

 /*
  * Defined outside this file; used by vhost_user_set_mem_table() to size
  * the memory regions advertised to the vhost-user device.
  */
 extern unsigned long long physmem_size, highmem;

 /* dev_err() against the platform device behind a struct virtio_uml_device. */
 #define vu_err(vu_dev, ...)	dev_err(&(vu_dev)->pdev->dev, ##__VA_ARGS__)

 /* Vhost-user protocol */

 /*
  * Send all @len bytes of @buf on @fd, retrying after partial sends and
  * -EINTR. The file descriptors in @fds are only attached to the first
  * chunk that is successfully sent.
  *
  * Returns 0 on success or the negative error from os_sendmsg_fds().
  */
 static int full_sendmsg_fds(int fd, const void *buf, unsigned int len,
 			    const int *fds, unsigned int fds_num)
 {
 	int rc;

 	for (;;) {
 		rc = os_sendmsg_fds(fd, buf, len, fds, fds_num);
 		if (rc > 0) {
 			/* advance past what was sent; fds go out only once */
 			buf += rc;
 			len -= rc;
 			fds = NULL;
 			fds_num = 0;
 		}

 		if (!len)
 			break;
 		if (rc < 0 && rc != -EINTR)
 			break;
 	}

 	return rc < 0 ? rc : 0;
 }

 /*
  * Read exactly @len bytes from @fd into @buf.
  *
  * -EINTR is always retried. -EAGAIN is retried only when @abortable is
  * false; otherwise it is returned to the caller.
  *
  * Returns 0 on success, -ECONNRESET if the last read returned 0, or the
  * negative error from os_read_file().
  */
 static int full_read(int fd, void *buf, int len, bool abortable)
 {
 	int rc;

 	for (;;) {
 		rc = os_read_file(fd, buf, len);
 		if (rc > 0) {
 			buf += rc;
 			len -= rc;
 		}

 		if (!len)
 			break;
 		if (rc > 0 || rc == -EINTR)
 			continue;
 		if (!abortable && rc == -EAGAIN)
 			continue;
 		break;
 	}

 	if (rc < 0)
 		return rc;
 	return rc ? 0 : -ECONNRESET;
 }

 /*
  * Read just the message header from @fd into @msg. The read is abortable,
  * i.e. -EAGAIN is returned rather than retried.
  */
 static int vhost_user_recv_header(int fd, struct vhost_user_msg *msg)
 {
 	int rc;

 	rc = full_read(fd, msg, sizeof(msg->header), true);
 	return rc;
 }

 /*
  * Receive one vhost-user message (header, then payload) from @fd.
  *
  * @max_payload_size bounds the payload size announced in the header;
  * larger messages are rejected with -EPROTO before any payload is read.
  * If the header read reports -ECONNRESET on a registered device, the
  * virtio device is marked broken and the connection-broken work is
  * scheduled.
  *
  * Returns 0 on success or a negative error code.
  */
 static int vhost_user_recv(struct virtio_uml_device *vu_dev,
 			   int fd, struct vhost_user_msg *msg,
 			   size_t max_payload_size, bool wait)
 {
 	size_t payload_len;
 	int rc;

 	/*
 	 * In virtio time-travel mode all vhost-user FDs are handled by
 	 * polling them when appropriate. While sending an interrupt
 	 * message to a device (e.g. a net device) we may, however, have
 	 * to process a simulation time message, e.g. one updating how
 	 * long we may run without scheduling.
 	 *
 	 * So rather than only read() from the given fd, wait for it to
 	 * become readable via this helper, which also handles simulation
 	 * time messages in the meantime.
 	 */
 	if (wait)
 		time_travel_wait_readable(fd);

 	rc = vhost_user_recv_header(fd, msg);
 	if (rc) {
 		if (rc == -ECONNRESET && vu_dev->registered) {
 			struct virtio_uml_platform_data *pdata =
 				vu_dev->pdev->dev.platform_data;

 			virtio_break_device(&vu_dev->vdev);
 			schedule_work(&pdata->conn_broken_wk);
 		}
 		return rc;
 	}

 	payload_len = msg->header.size;
 	if (payload_len > max_payload_size)
 		return -EPROTO;

 	/* the payload must follow the header, so this read is not abortable */
 	return full_read(fd, &msg->payload, payload_len, false);
 }

 /*
  * Receive a reply on the main socket (waiting for it to be readable) and
  * check that its flags are exactly "reply + protocol version".
  *
  * Returns 0 on success, -EPROTO on unexpected flags, or the error from
  * vhost_user_recv().
  */
 static int vhost_user_recv_resp(struct virtio_uml_device *vu_dev,
 				struct vhost_user_msg *msg,
 				size_t max_payload_size)
 {
 	int rc;

 	rc = vhost_user_recv(vu_dev, vu_dev->sock, msg, max_payload_size,
 			     true);
 	if (!rc &&
 	    msg->header.flags != (VHOST_USER_FLAG_REPLY | VHOST_USER_VERSION))
 		rc = -EPROTO;

 	return rc;
 }

 /*
  * Receive a reply whose payload is a single u64 and store it in @value.
  *
  * Returns 0 on success, -EPROTO if the payload size is not that of the
  * integer payload, or the error from vhost_user_recv_resp().
  */
 static int vhost_user_recv_u64(struct virtio_uml_device *vu_dev,
 			       u64 *value)
 {
 	struct vhost_user_msg msg;
 	int rc;

 	rc = vhost_user_recv_resp(vu_dev, &msg, sizeof(msg.payload.integer));
 	if (rc)
 		return rc;

 	if (msg.header.size == sizeof(msg.payload.integer)) {
 		*value = msg.payload.integer;
 		return 0;
 	}

 	return -EPROTO;
 }

 /*
  * Receive a request from the device on the slave request fd (without
  * waiting for readability) and validate its flags: apart from the
  * optional NEED_REPLY bit they must equal the protocol version.
  *
  * Returns 0 on success, -EPROTO on bad flags, or the error from
  * vhost_user_recv().
  */
 static int vhost_user_recv_req(struct virtio_uml_device *vu_dev,
 			       struct vhost_user_msg *msg,
 			       size_t max_payload_size)
 {
 	int rc;

 	rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg, max_payload_size,
 			     false);
 	if (!rc &&
 	    (msg->header.flags & ~VHOST_USER_FLAG_NEED_REPLY) !=
 			VHOST_USER_VERSION)
 		rc = -EPROTO;

 	return rc;
 }

 /*
  * Send @msg (header plus header.size bytes of payload) on the main socket,
  * optionally passing @num_fds file descriptors from @fds.
  *
  * If the caller does not expect a response of its own (@need_response is
  * false) and REPLY_ACK was negotiated, an ACK is requested and read back
  * while still holding sock_lock; a non-zero ACK status yields -EIO.
  *
  * Returns 0 on success or a negative error code.
  */
 static int vhost_user_send(struct virtio_uml_device *vu_dev,
 			   bool need_response, struct vhost_user_msg *msg,
 			   int *fds, size_t num_fds)
 {
 	size_t size = sizeof(msg->header) + msg->header.size;
 	unsigned long flags;
 	bool request_ack;
 	int rc;

 	msg->header.flags |= VHOST_USER_VERSION;

 	/*
 	 * When the caller already needs a response (e.g. to read the
 	 * features) an additional ACK would be meaningless, and an ACK
 	 * can only be requested if the device supports it.
 	 */
 	request_ack = !need_response &&
 		      (vu_dev->protocol_features &
 		       BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK));
 	if (request_ack)
 		msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY;

 	spin_lock_irqsave(&vu_dev->sock_lock, flags);

 	rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds);
 	if (rc >= 0 && request_ack) {
 		uint64_t status;

 		rc = vhost_user_recv_u64(vu_dev, &status);
 		if (!rc && status) {
 			vu_err(vu_dev, "slave reports error: %llu\n", status);
 			rc = -EIO;
 		}
 	}

 	spin_unlock_irqrestore(&vu_dev->sock_lock, flags);
 	return rc;
 }

 /* Send a header-only message of type @request, without file descriptors. */
 static int vhost_user_send_no_payload(struct virtio_uml_device *vu_dev,
 				      bool need_response, u32 request)
 {
 	struct vhost_user_msg hdr_only = {
 		.header.request = request,
 	};
 	int rc;

 	rc = vhost_user_send(vu_dev, need_response, &hdr_only, NULL, 0);
 	return rc;
 }

 /*
  * Send a header-only message of type @request with the single file
  * descriptor @fd attached. No response payload is expected.
  */
 static int vhost_user_send_no_payload_fd(struct virtio_uml_device *vu_dev,
 					 u32 request, int fd)
 {
 	struct vhost_user_msg hdr_only = {
 		.header.request = request,
 	};
 	int rc;

 	rc = vhost_user_send(vu_dev, false, &hdr_only, &fd, 1);
 	return rc;
 }

 /* Send a message of type @request whose payload is the u64 @value. */
 static int vhost_user_send_u64(struct virtio_uml_device *vu_dev,
 			       u32 request, u64 value)
 {
 	struct vhost_user_msg u64_msg = {
 		.header.request = request,
 		.header.size = sizeof(u64_msg.payload.integer),
 		.payload.integer = value,
 	};
 	int rc;

 	rc = vhost_user_send(vu_dev, false, &u64_msg, NULL, 0);
 	return rc;
 }

 /* Send VHOST_USER_SET_OWNER; no response payload is expected. */
 static int vhost_user_set_owner(struct virtio_uml_device *vu_dev)
 {
 	int rc;

 	rc = vhost_user_send_no_payload(vu_dev, false, VHOST_USER_SET_OWNER);
 	return rc;
 }

 /*
  * Query the device feature bits: send VHOST_USER_GET_FEATURES and read
  * the u64 reply into @features. Returns 0 or a negative error code.
  */
 static int vhost_user_get_features(struct virtio_uml_device *vu_dev,
 				   u64 *features)
 {
 	int rc;

 	rc = vhost_user_send_no_payload(vu_dev, true,
 					VHOST_USER_GET_FEATURES);
 	if (!rc)
 		rc = vhost_user_recv_u64(vu_dev, features);

 	return rc;
 }

 /* Send VHOST_USER_SET_FEATURES carrying @features. */
 static int vhost_user_set_features(struct virtio_uml_device *vu_dev,
 				   u64 features)
 {
 	int rc;

 	rc = vhost_user_send_u64(vu_dev, VHOST_USER_SET_FEATURES, features);
 	return rc;
 }

 /*
  * Query the protocol feature bits: send VHOST_USER_GET_PROTOCOL_FEATURES
  * and read the u64 reply into @protocol_features.
  * Returns 0 or a negative error code.
  */
 static int vhost_user_get_protocol_features(struct virtio_uml_device *vu_dev,
 					    u64 *protocol_features)
 {
 	int rc;

 	rc = vhost_user_send_no_payload(vu_dev, true,
 					VHOST_USER_GET_PROTOCOL_FEATURES);
 	if (!rc)
 		rc = vhost_user_recv_u64(vu_dev, protocol_features);

 	return rc;
 }

 /* Send VHOST_USER_SET_PROTOCOL_FEATURES carrying @protocol_features. */
 static int vhost_user_set_protocol_features(struct virtio_uml_device *vu_dev,
 					    u64 protocol_features)
 {
 	int rc;

 	rc = vhost_user_send_u64(vu_dev, VHOST_USER_SET_PROTOCOL_FEATURES,
 				 protocol_features);
 	return rc;
 }

 /*
  * Answer the device request @msg on the slave request fd. The reply
  * reuses the request header with NEED_REPLY cleared and REPLY set, and
  * carries @response as a u64 payload. Send failures are only logged.
  */
 static void vhost_user_reply(struct virtio_uml_device *vu_dev,
 			     struct vhost_user_msg *msg, int response)
 {
 	struct vhost_user_msg reply = {
 		.payload.integer = response,
 	};
 	size_t size = sizeof(reply.header) + sizeof(reply.payload.integer);
 	int rc;

 	reply.header = msg->header;
 	reply.header.size = sizeof(reply.payload.integer);
 	reply.header.flags &= ~VHOST_USER_FLAG_NEED_REPLY;
 	reply.header.flags |= VHOST_USER_FLAG_REPLY;

 	rc = full_sendmsg_fds(vu_dev->req_fd, &reply, size, NULL, 0);
 	if (!rc)
 		return;

 	vu_err(vu_dev,
 	       "sending reply to slave request failed: %d (size %zu)\n",
 	       rc, size);
 }

 /*
  * IRQ handler for the slave request fd: receive one request from the
  * device and act on it.
  *
  * Handled requests are config-change notifications and in-band vring
  * "call" messages; everything else is logged as unexpected. If the
  * request has NEED_REPLY set, a reply is sent with payload 0 when the
  * request was handled and 1 otherwise.
  *
  * Returns IRQ_NONE if no valid request could be received, IRQ_HANDLED
  * otherwise.
  */
 static irqreturn_t vu_req_interrupt(int irq, void *data)
 {
 	struct virtio_uml_device *vu_dev = data;
 	struct virtqueue *vq;
 	/* reply status: stays 1 (failure) unless a request is handled */
 	int response = 1;
 	/* receive buffer: the message plus 512 extra payload bytes */
 	struct {
 		struct vhost_user_msg msg;
 		u8 extra_payload[512];
 	} msg;
 	int rc;

 	rc = vhost_user_recv_req(vu_dev, &msg.msg,
 				 sizeof(msg.msg.payload) +
 				 sizeof(msg.extra_payload));

 	if (rc)
 		return IRQ_NONE;

 	switch (msg.msg.header.request) {
 	case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG:
 		virtio_config_changed(&vu_dev->vdev);
 		response = 0;
 		break;
 	case VHOST_USER_SLAVE_VRING_CALL:
 		/* find the virtqueue with the requested index and fire it */
 		virtio_device_for_each_vq((&vu_dev->vdev), vq) {
 			if (vq->index == msg.msg.payload.vring_state.index) {
 				response = 0;
 				vring_interrupt(0 /* ignored */, vq);
 				break;
 			}
 		}
 		break;
 	case VHOST_USER_SLAVE_IOTLB_MSG:
 		/* not supported - VIRTIO_F_ACCESS_PLATFORM */
 	case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
 		/* not supported - VHOST_USER_PROTOCOL_F_HOST_NOTIFIER */
 	default:
 		/* the two unsupported cases above fall through to here */
 		vu_err(vu_dev, "unexpected slave request %d\n",
 		       msg.msg.header.request);
 	}

 	if (msg.msg.header.flags & VHOST_USER_FLAG_NEED_REPLY)
 		vhost_user_reply(vu_dev, &msg.msg, response);

 	return IRQ_HANDLED;
 }

 /*
  * Set up the slave request channel: create a pipe, register an IRQ
  * handler (vu_req_interrupt) on its read end and pass the write end to
  * the device with VHOST_USER_SET_SLAVE_REQ_FD.
  *
  * The local copy of the write end is always closed before returning; on
  * failure the IRQ (if registered) and the read end are released as well.
  *
  * Returns 0 on success or a negative error code.
  */
 static int vhost_user_init_slave_req(struct virtio_uml_device *vu_dev)
 {
 	int req_fds[2];
 	int rc;

 	/* Use a pipe for slave req fd, SIGIO is not supported for eventfd */
 	rc = os_pipe(req_fds, true, true);
 	if (rc < 0)
 		return rc;
 	vu_dev->req_fd = req_fds[0];

 	rc = um_request_irq(VIRTIO_IRQ, vu_dev->req_fd, IRQ_READ,
 			    vu_req_interrupt, IRQF_SHARED,
 			    vu_dev->pdev->name, vu_dev);
 	if (!rc) {
 		rc = vhost_user_send_no_payload_fd(vu_dev,
 						   VHOST_USER_SET_SLAVE_REQ_FD,
 						   req_fds[1]);
 		if (rc)
 			um_free_irq(VIRTIO_IRQ, vu_dev);
 	}

 	if (rc)
 		os_close_file(req_fds[0]);

 	/* Close unused write end of request fds */
 	os_close_file(req_fds[1]);
 	return rc;
 }

 /*
  * Initial vhost-user handshake: claim ownership, read the feature bits
  * and, if the device offers protocol features, negotiate them (limited to
  * VHOST_USER_SUPPORTED_PROTOCOL_F). When SLAVE_REQ was negotiated the
  * slave request channel is set up as well.
  *
  * Returns 0 on success or the first error encountered.
  */
 static int vhost_user_init(struct virtio_uml_device *vu_dev)
 {
 	int rc;

 	rc = vhost_user_set_owner(vu_dev);
 	if (rc)
 		return rc;

 	rc = vhost_user_get_features(vu_dev, &vu_dev->features);
 	if (rc)
 		return rc;

 	if (vu_dev->features & BIT_ULL(VHOST_USER_F_PROTOCOL_FEATURES)) {
 		rc = vhost_user_get_protocol_features(vu_dev,
 				&vu_dev->protocol_features);
 		if (rc)
 			return rc;

 		/* only keep the protocol features this driver implements */
 		vu_dev->protocol_features &= VHOST_USER_SUPPORTED_PROTOCOL_F;

 		rc = vhost_user_set_protocol_features(vu_dev,
 				vu_dev->protocol_features);
 		if (rc)
 			return rc;
 	}

 	if (!(vu_dev->protocol_features &
 			BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ)))
 		return 0;

 	return vhost_user_init_slave_req(vu_dev);
 }

 /*
  * Read @len bytes of device config space starting at @offset into @buf.
  *
  * The config space is always requested from offset 0 up to @offset + @len
  * and the wanted window is copied out of the reply. Nothing is done (and
  * @buf is left untouched) if VHOST_USER_PROTOCOL_F_CONFIG was not
  * negotiated, on allocation failure, or on any protocol error; errors are
  * logged.
  */
 static void vhost_user_get_config(struct virtio_uml_device *vu_dev,
 				  u32 offset, void *buf, u32 len)
 {
 	u32 cfg_size = offset + len;
 	struct vhost_user_msg *msg;
 	size_t payload_size = sizeof(msg->payload.config) + cfg_size;
 	size_t msg_size = sizeof(msg->header) + payload_size;
 	int rc;

 	if (!(vu_dev->protocol_features &
 	      BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG)))
 		return;

 	/*
 	 * offset + len is computed in 32 bits; if it wrapped, the buffer
 	 * below would be smaller than the window copied out of it.
 	 */
 	if (cfg_size < offset) {
 		vu_err(vu_dev,
 		       "config access out of range (offset %u, len %u)\n",
 		       offset, len);
 		return;
 	}

 	msg = kzalloc(msg_size, GFP_KERNEL);
 	if (!msg)
 		return;
 	msg->header.request = VHOST_USER_GET_CONFIG;
 	msg->header.size = payload_size;
 	msg->payload.config.offset = 0;
 	msg->payload.config.size = cfg_size;

 	rc = vhost_user_send(vu_dev, true, msg, NULL, 0);
 	if (rc) {
 		vu_err(vu_dev, "sending VHOST_USER_GET_CONFIG failed: %d\n",
 		       rc);
 		goto free;
 	}

 	/*
 	 * The limit is the room available for the *payload*: the reply's
 	 * payload is read into &msg->payload, which only has payload_size
 	 * bytes behind it. Passing msg_size here would let the peer
 	 * overflow the allocation by sizeof(msg->header) bytes.
 	 */
 	rc = vhost_user_recv_resp(vu_dev, msg, payload_size);
 	if (rc) {
 		vu_err(vu_dev,
 		       "receiving VHOST_USER_GET_CONFIG response failed: %d\n",
 		       rc);
 		goto free;
 	}

 	if (msg->header.size != payload_size ||
 	    msg->payload.config.size != cfg_size) {
 		rc = -EPROTO;
 		vu_err(vu_dev,
 		       "Invalid VHOST_USER_GET_CONFIG sizes (payload %d expected %zu, config %u expected %u)\n",
 		       msg->header.size, payload_size,
 		       msg->payload.config.size, cfg_size);
 		goto free;
 	}
 	memcpy(buf, msg->payload.config.payload + offset, len);

 free:
 	kfree(msg);
 }

 /*
  * Write @len bytes from @buf to device config space at @offset using
  * VHOST_USER_SET_CONFIG. Silently does nothing if
  * VHOST_USER_PROTOCOL_F_CONFIG was not negotiated or the message cannot
  * be allocated; a send failure is logged.
  */
 static void vhost_user_set_config(struct virtio_uml_device *vu_dev,
 				  u32 offset, const void *buf, u32 len)
 {
 	struct vhost_user_msg *msg;
 	size_t payload_size;
 	size_t msg_size;
 	int rc;

 	payload_size = sizeof(msg->payload.config) + len;
 	msg_size = sizeof(msg->header) + payload_size;

 	if (!(vu_dev->protocol_features &
 	      BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG)))
 		return;

 	msg = kzalloc(msg_size, GFP_KERNEL);
 	if (!msg)
 		return;

 	msg->header.request = VHOST_USER_SET_CONFIG;
 	msg->header.size = payload_size;
 	msg->payload.config.offset = offset;
 	msg->payload.config.size = len;
 	memcpy(msg->payload.config.payload, buf, len);

 	rc = vhost_user_send(vu_dev, false, msg, NULL, 0);
 	if (rc) {
 		vu_err(vu_dev, "sending VHOST_USER_SET_CONFIG failed: %d\n",
 		       rc);
 	}

 	kfree(msg);
 }

 /*
  * Describe the physical range [@addr, @addr + @size) as a vhost-user
  * memory region.
  *
  * The fd returned by phys_mapping() for @addr is stored in @fd_out and
  * @region_out is filled in, using @addr as both guest and user address.
  * The last byte of the range must map to the same fd.
  *
  * Returns 0 on success, -EFAULT (with a WARN) otherwise.
  */
 static int vhost_user_init_mem_region(u64 addr, u64 size, int *fd_out,
 				      struct vhost_user_mem_region *region_out)
 {
 	unsigned long long mem_offset;
 	int rc;

 	rc = phys_mapping(addr, &mem_offset);
 	if (WARN(rc < 0, "phys_mapping of 0x%llx returned %d\n", addr, rc))
 		return -EFAULT;

 	*fd_out = rc;
 	region_out->mmap_offset = mem_offset;
 	region_out->size = size;
 	region_out->user_addr = addr;
 	region_out->guest_addr = addr;

 	/* Ensure mapping is valid for the entire region */
 	rc = phys_mapping(addr + size - 1, &mem_offset);
 	if (WARN(rc != *fd_out, "phys_mapping of 0x%llx failed: %d != %d\n",
 		 addr + size - 1, rc, *fd_out))
 		return -EFAULT;

 	return 0;
 }

 /*
  * Advertise our memory to the device with VHOST_USER_SET_MEM_TABLE.
  *
  * One region covers physical memory from the end of the reserved area up
  * to physmem_size; a second region is added for highmem when highmem is
  * non-zero. The backing fd of each region is passed along with the
  * message.
  *
  * Returns 0 on success or a negative error code.
  */
 static int vhost_user_set_mem_table(struct virtio_uml_device *vu_dev)
 {
 	struct vhost_user_msg msg = {
 		.header.request = VHOST_USER_SET_MEM_TABLE,
 		.header.size = sizeof(msg.payload.mem_regions),
 		.payload.mem_regions.num = 1,
 	};
 	/* size of the area at the start of physmem that is not advertised */
 	unsigned long reserved = uml_reserved - uml_physmem;
 	/* one fd per region: [0] physmem, [1] highmem (if present) */
 	int fds[2];
 	int rc;

 	/*
 	 * This is a bit tricky, see also the comment with setup_physmem().
 	 *
 	 * Essentially, setup_physmem() uses a file to mmap() our physmem,
 	 * but the code and data we *already* have is omitted. To us, this
 	 * is no difference, since they both become part of our address
 	 * space and memory consumption. To somebody looking in from the
 	 * outside, however, it is different because the part of our memory
 	 * consumption that's already part of the binary (code/data) is not
 	 * mapped from the file, so it's not visible to another mmap from
 	 * the file descriptor.
 	 *
 	 * Thus, don't advertise this space to the vhost-user slave. This
 	 * means that the slave will likely abort or similar when we give
 	 * it an address from the hidden range, since it's not marked as
 	 * a valid address, but at least that way we detect the issue and
 	 * don't just have the slave read an all-zeroes buffer from the
 	 * shared memory file, or write something there that we can never
 	 * see (depending on the direction of the virtqueue traffic.)
 	 *
 	 * Since we usually don't want to use .text for virtio buffers,
 	 * this effectively means that you cannot use
 	 *  1) global variables, which are in the .bss and not in the shm
 	 *     file-backed memory
 	 *  2) the stack in some processes, depending on where they have
 	 *     their stack (or maybe only no interrupt stack?)
 	 *
 	 * The stack is already not typically valid for DMA, so this isn't
 	 * much of a restriction, but global variables might be encountered.
 	 *
 	 * It might be possible to fix it by copying around the data that's
 	 * between bss_start and where we map the file now, but it's not
 	 * something that you typically encounter with virtio drivers, so
 	 * it didn't seem worthwhile.
 	 */
 	rc = vhost_user_init_mem_region(reserved, physmem_size - reserved,
 					&fds[0],
 					&msg.payload.mem_regions.regions[0]);

 	if (rc < 0)
 		return rc;
 	if (highmem) {
 		/* second region: highmem, starting at the end of iomem */
 		msg.payload.mem_regions.num++;
 		rc = vhost_user_init_mem_region(__pa(end_iomem), highmem,
 				&fds[1], &msg.payload.mem_regions.regions[1]);
 		if (rc < 0)
 			return rc;
 	}

 	/* the number of fds passed equals the number of regions */
 	return vhost_user_send(vu_dev, false, &msg, fds,
 			       msg.payload.mem_regions.num);
 }

 /*
  * Send a message of type @request carrying a vring_state payload made of
  * the queue @index and the value @num.
  */
 static int vhost_user_set_vring_state(struct virtio_uml_device *vu_dev,
 				      u32 request, u32 index, u32 num)
 {
 	struct vhost_user_msg state_msg = {
 		.header.request = request,
 		.header.size = sizeof(state_msg.payload.vring_state),
 		.payload.vring_state.index = index,
 		.payload.vring_state.num = num,
 	};
 	int rc;

 	rc = vhost_user_send(vu_dev, false, &state_msg, NULL, 0);
 	return rc;
 }

 /* Tell the device the size @num of the ring of queue @index. */
 static int vhost_user_set_vring_num(struct virtio_uml_device *vu_dev,
 				    u32 index, u32 num)
 {
 	int rc;

 	rc = vhost_user_set_vring_state(vu_dev, VHOST_USER_SET_VRING_NUM,
 					index, num);
 	return rc;
 }

 /* Send VHOST_USER_SET_VRING_BASE with value @offset for queue @index. */
 static int vhost_user_set_vring_base(struct virtio_uml_device *vu_dev,
 				     u32 index, u32 offset)
 {
 	int rc;

 	rc = vhost_user_set_vring_state(vu_dev, VHOST_USER_SET_VRING_BASE,
 					index, offset);
 	return rc;
 }

 /*
  * Send VHOST_USER_SET_VRING_ADDR for queue @index with the descriptor,
  * used, available and log addresses.
  */
 static int vhost_user_set_vring_addr(struct virtio_uml_device *vu_dev,
 				     u32 index, u64 desc, u64 used, u64 avail,
 				     u64 log)
 {
 	struct vhost_user_msg addr_msg = {
 		.header.request = VHOST_USER_SET_VRING_ADDR,
 		.header.size = sizeof(addr_msg.payload.vring_addr),
 		.payload.vring_addr.index = index,
 		.payload.vring_addr.desc = desc,
 		.payload.vring_addr.used = used,
 		.payload.vring_addr.avail = avail,
 		.payload.vring_addr.log = log,
 	};
 	int rc;

 	rc = vhost_user_send(vu_dev, false, &addr_msg, NULL, 0);
 	return rc;
 }

 /*
  * Send a vring fd message (@request) for queue @index.
  *
  * With a valid @fd the descriptor is attached to the message. With a
  * negative @fd no descriptor is sent and VHOST_USER_VRING_POLL_MASK is
  * set in the payload instead.
  *
  * Returns -EINVAL if @index has bits outside VHOST_USER_VRING_INDEX_MASK,
  * otherwise the result of vhost_user_send().
  */
 static int vhost_user_set_vring_fd(struct virtio_uml_device *vu_dev,
 				   u32 request, int index, int fd)
 {
 	struct vhost_user_msg fd_msg = {
 		.header.request = request,
 		.header.size = sizeof(fd_msg.payload.integer),
 		.payload.integer = index,
 	};

 	if (index & ~VHOST_USER_VRING_INDEX_MASK)
 		return -EINVAL;

 	if (fd >= 0)
 		return vhost_user_send(vu_dev, false, &fd_msg, &fd, 1);

 	fd_msg.payload.integer |= VHOST_USER_VRING_POLL_MASK;
 	return vhost_user_send(vu_dev, false, &fd_msg, NULL, 0);
 }

 /* Hand the call fd @fd for queue @index to the device. */
 static int vhost_user_set_vring_call(struct virtio_uml_device *vu_dev,
 				     int index, int fd)
 {
 	int rc;

 	rc = vhost_user_set_vring_fd(vu_dev, VHOST_USER_SET_VRING_CALL,
 				     index, fd);
 	return rc;
 }

 /* Hand the kick fd @fd for queue @index to the device. */
 static int vhost_user_set_vring_kick(struct virtio_uml_device *vu_dev,
 				     int index, int fd)
 {
 	int rc;

 	rc = vhost_user_set_vring_fd(vu_dev, VHOST_USER_SET_VRING_KICK,
 				     index, fd);
 	return rc;
 }

 /*
  * Enable or disable queue @index on the device. This is a no-op returning
  * 0 when the device did not offer VHOST_USER_F_PROTOCOL_FEATURES.
  */
 static int vhost_user_set_vring_enable(struct virtio_uml_device *vu_dev,
 				       u32 index, bool enable)
 {
 	if (vu_dev->features & BIT_ULL(VHOST_USER_F_PROTOCOL_FEATURES))
 		return vhost_user_set_vring_state(vu_dev,
 						  VHOST_USER_SET_VRING_ENABLE,
 						  index, enable);

 	return 0;
 }


 /* Virtio interface */

 /*
  * Virtqueue notify hook: kick the device for @vq.
  *
  * With a kick fd, the value 1 is written to it (retrying on -EINTR).
  * Without one (kick_fd < 0), an in-band VHOST_USER_VRING_KICK message is
  * sent instead.
  *
  * Returns true if the kick was delivered.
  */
 static bool vu_notify(struct virtqueue *vq)
 {
 	struct virtio_uml_vq_info *info = vq->priv;
 	const uint64_t n = 1;
 	int rc;

 	time_travel_propagate_time();

 	if (info->kick_fd >= 0) {
 		do {
 			rc = os_write_file(info->kick_fd, &n, sizeof(n));
 		} while (rc == -EINTR);

 		return !WARN(rc != sizeof(n), "write returned %d\n", rc);
 	}

 	/* no kick fd: notify with an in-band message */
 	rc = vhost_user_set_vring_state(to_virtio_uml_device(vq->vdev),
 					VHOST_USER_VRING_KICK,
 					vq->index, 0);
 	return rc == 0;
 }

 /*
  * IRQ handler for a virtqueue's call fd: read 8-byte values from the fd
  * for as long as full values are available and run vring_interrupt() for
  * each one. The final read is expected to return -EAGAIN; anything else
  * triggers a WARN.
  */
 static irqreturn_t vu_interrupt(int irq, void *opaque)
 {
 	struct virtqueue *vq = opaque;
 	struct virtio_uml_vq_info *info = vq->priv;
 	irqreturn_t ret = IRQ_NONE;
 	uint64_t n;
 	int rc;

 	for (;;) {
 		rc = os_read_file(info->call_fd, &n, sizeof(n));
 		if (rc == sizeof(n)) {
 			ret |= vring_interrupt(irq, vq);
 			continue;
 		}
 		if (rc != -EINTR)
 			break;
 	}

 	WARN(rc != -EAGAIN, "read returned %d\n", rc);
 	return ret;
 }


 /* virtio_config_ops.get: read config space via vhost_user_get_config(). */
 static void vu_get(struct virtio_device *vdev, unsigned offset,
 		   void *buf, unsigned len)
 {
 	vhost_user_get_config(to_virtio_uml_device(vdev), offset, buf, len);
 }

 /* virtio_config_ops.set: write config space via vhost_user_set_config(). */
 static void vu_set(struct virtio_device *vdev, unsigned offset,
 		   const void *buf, unsigned len)
 {
 	vhost_user_set_config(to_virtio_uml_device(vdev), offset, buf, len);
 }

 static u8 vu_get_status(struct virtio_device *vdev)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);

 	return vu_dev->status;
 }

 /*
  * virtio_config_ops.set_status: store @status locally; nothing is sent to
  * the device.
  */
 static void vu_set_status(struct virtio_device *vdev, u8 status)
 {
 	to_virtio_uml_device(vdev)->status = status;
 }

 static void vu_reset(struct virtio_device *vdev)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);

 	vu_dev->status = 0;
 }

 /*
  * Tear down one virtqueue: release the call IRQ and fd (if any), close
  * the kick fd (if any), delete the vring and free the private info.
  */
 static void vu_del_vq(struct virtqueue *vq)
 {
 	struct virtio_uml_vq_info *vq_info = vq->priv;

 	if (!(vq_info->call_fd < 0)) {
 		um_free_irq(VIRTIO_IRQ, vq);
 		os_close_file(vq_info->call_fd);
 	}

 	if (!(vq_info->kick_fd < 0))
 		os_close_file(vq_info->kick_fd);

 	vring_del_virtqueue(vq);
 	kfree(vq_info);
 }

 static void vu_del_vqs(struct virtio_device *vdev)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);
 	struct virtqueue *vq, *n;
 	u64 features;

 	/* Note: reverse order as a workaround to a decoding bug in snabb */
 	list_for_each_entry_reverse(vq, &vdev->vqs, list)
 		WARN_ON(vhost_user_set_vring_enable(vu_dev, vq->index, false));

 	/* Ensure previous messages have been processed */
 	WARN_ON(vhost_user_get_features(vu_dev, &features));

 	list_for_each_entry_safe(vq, n, &vdev->vqs, list)
 		vu_del_vq(vq);
 }

 /*
  * Set up the "call" (device-to-driver notification) path for @vq.
  *
  * When both in-band notifications and slave requests were negotiated no
  * call fd is used and info->call_fd is set to -1. Otherwise a pipe is
  * created, an IRQ handler (vu_interrupt) is registered on its read end
  * and the write end is passed to the device.
  *
  * The local copy of the write end is always closed before returning; on
  * failure the IRQ (if registered) and the read end are released too.
  *
  * Returns 0 on success or a negative error code.
  */
 static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev,
 			       struct virtqueue *vq)
 {
 	struct virtio_uml_vq_info *info = vq->priv;
 	int call_fds[2];
 	int rc;

 	/* no call FD needed/desired in this case */
 	if ((vu_dev->protocol_features &
 			BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) &&
 	    (vu_dev->protocol_features &
 			BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ))) {
 		info->call_fd = -1;
 		return 0;
 	}

 	/* Use a pipe for call fd, since SIGIO is not supported for eventfd */
 	rc = os_pipe(call_fds, true, true);
 	if (rc < 0)
 		return rc;
 	info->call_fd = call_fds[0];

 	rc = um_request_irq(VIRTIO_IRQ, info->call_fd, IRQ_READ,
 			    vu_interrupt, IRQF_SHARED, info->name, vq);
 	if (!rc) {
 		rc = vhost_user_set_vring_call(vu_dev, vq->index,
 					       call_fds[1]);
 		if (rc)
 			um_free_irq(VIRTIO_IRQ, vq);
 	}

 	if (rc)
 		os_close_file(call_fds[0]);

 	/* Close (unused) write end of call fds */
 	os_close_file(call_fds[1]);

 	return rc;
 }

 #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
 /*
  * Time-travel event handler: run the original virtqueue callback that
  * vu_defer_irq_callback() deferred.
  */
 static void vu_defer_irq_handle(struct time_travel_event *d)
 {
 	struct virtio_uml_vq_info *vq_info =
 		container_of(d, struct virtio_uml_vq_info, defer);

 	vq_info->callback(vq_info->vq);
 }

 /*
  * Virtqueue callback used in place of the driver's own one: queue the
  * per-queue time-travel IRQ event rather than calling the driver directly.
  */
 static void vu_defer_irq_callback(struct virtqueue *vq)
 {
 	struct virtio_uml_vq_info *vq_info = vq->priv;
 	struct time_travel_event *ev = &vq_info->defer;

 	time_travel_add_irq_event(ev);
 }
 #endif

 /*
  * Create and configure one virtqueue.
  *
  * Allocates the per-queue info, creates the vring, sets up the kick and
  * call notification paths and tells the device the ring size, base and
  * addresses. The kick fd itself is handed to the device later, in
  * vu_find_vqs().
  *
  * Returns the new virtqueue or an ERR_PTR() on failure, in which case
  * everything set up so far has been released again.
  */
 static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
 				     unsigned index, vq_callback_t *callback,
 				     const char *name, bool ctx)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);
 	struct platform_device *pdev = vu_dev->pdev;
 	struct virtio_uml_vq_info *info;
 	struct virtqueue *vq;
 	int num = MAX_SUPPORTED_QUEUE_SIZE;
 	int rc;

 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info) {
 		rc = -ENOMEM;
 		goto error_kzalloc;
 	}
 	/* name is "<pdev name>.<pdev id>-<vq name>", truncated to fit */
 	snprintf(info->name, sizeof(info->name), "%s.%d-%s", pdev->name,
 		 pdev->id, name);

 #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
 	/*
 	 * When we get an interrupt, we must bounce it through the simulation
 	 * calendar (the simtime device), except for the simtime device itself
 	 * since that's part of the simulation control.
 	 */
 	if (time_travel_mode == TT_MODE_EXTERNAL && callback) {
 		info->callback = callback;
 		callback = vu_defer_irq_callback;
 		time_travel_set_event_fn(&info->defer, vu_defer_irq_handle);
 	}
 #endif

 	vq = vring_create_virtqueue(index, num, PAGE_SIZE, vdev, true, true,
 				    ctx, vu_notify, callback, info->name);
 	if (!vq) {
 		rc = -ENOMEM;
 		goto error_create;
 	}
 	vq->priv = info;
 	/* use the ring size actually created from here on */
 	num = virtqueue_get_vring_size(vq);
 #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
 	info->vq = vq;
 #endif

 	/* with in-band notifications no kick fd is needed, see vu_notify() */
 	if (vu_dev->protocol_features &
 			BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) {
 		info->kick_fd = -1;
 	} else {
 		rc = os_eventfd(0, 0);
 		if (rc < 0)
 			goto error_kick;
 		info->kick_fd = rc;
 	}

 	rc = vu_setup_vq_call_fd(vu_dev, vq);
 	if (rc)
 		goto error_call;

 	rc = vhost_user_set_vring_num(vu_dev, index, num);
 	if (rc)
 		goto error_setup;

 	rc = vhost_user_set_vring_base(vu_dev, index, 0);
 	if (rc)
 		goto error_setup;

 	/* the log address is passed as all-ones */
 	rc = vhost_user_set_vring_addr(vu_dev, index,
 				       virtqueue_get_desc_addr(vq),
 				       virtqueue_get_used_addr(vq),
 				       virtqueue_get_avail_addr(vq),
 				       (u64) -1);
 	if (rc)
 		goto error_setup;

 	return vq;

 	/* unwind in reverse order of setup */
 error_setup:
 	if (info->call_fd >= 0) {
 		um_free_irq(VIRTIO_IRQ, vq);
 		os_close_file(info->call_fd);
 	}
 error_call:
 	if (info->kick_fd >= 0)
 		os_close_file(info->kick_fd);
 error_kick:
 	vring_del_virtqueue(vq);
 error_create:
 	kfree(info);
 error_kzalloc:
 	return ERR_PTR(rc);
 }

 /*
  * virtio_config_ops.find_vqs: set up all requested virtqueues.
  *
  * Sends the memory table first, then creates one queue per non-NULL entry
  * in @names (entries without a name get a NULL vq and consume no queue
  * index). Afterwards each queue's kick fd, if it has one, is passed to
  * the device and the ring is enabled.
  *
  * Returns 0 on success. On failure all queues are deleted again and a
  * negative error code is returned.
  */
 static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		       struct virtqueue *vqs[], vq_callback_t *callbacks[],
 		       const char * const names[], const bool *ctx,
 		       struct irq_affinity *desc)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);
 	struct virtqueue *vq;
 	int queue_idx = 0;
 	int i, rc;

 	rc = vhost_user_set_mem_table(vu_dev);
 	if (rc)
 		return rc;

 	for (i = 0; i < nvqs; ++i) {
 		bool with_ctx;

 		if (!names[i]) {
 			vqs[i] = NULL;
 			continue;
 		}

 		with_ctx = ctx ? ctx[i] : false;
 		vqs[i] = vu_setup_vq(vdev, queue_idx++, callbacks[i], names[i],
 				     with_ctx);
 		if (IS_ERR(vqs[i])) {
 			rc = PTR_ERR(vqs[i]);
 			goto fail;
 		}
 	}

 	list_for_each_entry(vq, &vdev->vqs, list) {
 		struct virtio_uml_vq_info *info = vq->priv;

 		rc = 0;
 		if (info->kick_fd >= 0)
 			rc = vhost_user_set_vring_kick(vu_dev, vq->index,
 						       info->kick_fd);
 		if (!rc)
 			rc = vhost_user_set_vring_enable(vu_dev, vq->index,
 							 true);
 		if (rc)
 			goto fail;
 	}

 	return 0;

 fail:
 	vu_del_vqs(vdev);
 	return rc;
 }

 static u64 vu_get_features(struct virtio_device *vdev)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);

 	return vu_dev->features;
 }

 /*
  * virtio_config_ops.finalize_features: let the vring code adjust the
  * transport feature bits, then send the resulting set to the device.
  *
  * Bits covered by VHOST_USER_SUPPORTED_F are sampled before
  * vring_transport_features() runs and OR-ed back in afterwards.
  */
 static int vu_finalize_features(struct virtio_device *vdev)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);
 	u64 kept;

 	kept = vdev->features & VHOST_USER_SUPPORTED_F;
 	vring_transport_features(vdev);

 	vu_dev->features = kept | vdev->features;
 	return vhost_user_set_features(vu_dev, vu_dev->features);
 }

 static const char *vu_bus_name(struct virtio_device *vdev)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);

 	return vu_dev->pdev->name;
 }

 /* virtio transport operations, installed as vdev.config by the probe. */
 static const struct virtio_config_ops virtio_uml_config_ops = {
 	.get = vu_get,
 	.set = vu_set,
 	.get_status = vu_get_status,
 	.set_status = vu_set_status,
 	.reset = vu_reset,
 	.find_vqs = vu_find_vqs,
 	.del_vqs = vu_del_vqs,
 	.get_features = vu_get_features,
 	.finalize_features = vu_finalize_features,
 	.bus_name = vu_bus_name,
 };

 /*
  * Device release callback for the embedded virtio device: release the
  * slave request IRQ and fd (if they were set up), close the vhost-user
  * socket and free the virtio_uml_device.
  */
 static void virtio_uml_release_dev(struct device *d)
 {
 	struct virtio_device *vdev;
 	struct virtio_uml_device *vu_dev;

 	vdev = container_of(d, struct virtio_device, dev);
 	vu_dev = to_virtio_uml_device(vdev);

 	time_travel_propagate_time();

 	/* might not have been opened due to not negotiating the feature */
 	if (!(vu_dev->req_fd < 0)) {
 		um_free_irq(VIRTIO_IRQ, vu_dev);
 		os_close_file(vu_dev->req_fd);
 	}

 	os_close_file(vu_dev->sock);
 	kfree(vu_dev);
 }

 /* Platform device */

 /*
  * Platform driver probe: allocate the virtio_uml_device, connect to the
  * vhost-user socket named in the platform data, run the vhost-user
  * handshake and register the virtio device.
  *
  * Returns 0 on success, -EINVAL without platform data, -ENOMEM on
  * allocation failure, or the error from connecting, the handshake or the
  * registration.
  */
 static int virtio_uml_probe(struct platform_device *pdev)
 {
 	struct virtio_uml_platform_data *pdata = pdev->dev.platform_data;
 	struct virtio_uml_device *vu_dev;
 	int rc;

 	if (!pdata)
 		return -EINVAL;

 	vu_dev = kzalloc(sizeof(*vu_dev), GFP_KERNEL);
 	if (!vu_dev)
 		return -ENOMEM;

 	vu_dev->vdev.dev.parent = &pdev->dev;
 	vu_dev->vdev.dev.release = virtio_uml_release_dev;
 	vu_dev->vdev.config = &virtio_uml_config_ops;
 	vu_dev->vdev.id.device = pdata->virtio_device_id;
 	vu_dev->vdev.id.vendor = VIRTIO_DEV_ANY_ID;
 	vu_dev->pdev = pdev;
 	/* no slave request fd until vhost_user_init_slave_req() sets one */
 	vu_dev->req_fd = -1;

 	time_travel_propagate_time();

 	do {
 		rc = os_connect_socket(pdata->socket_path);
 	} while (rc == -EINTR);
 	if (rc < 0)
 		goto error_free;
 	vu_dev->sock = rc;

 	spin_lock_init(&vu_dev->sock_lock);

 	rc = vhost_user_init(vu_dev);
 	if (rc)
 		goto error_init;

 	platform_set_drvdata(pdev, vu_dev);

 	rc = register_virtio_device(&vu_dev->vdev);
 	if (rc) {
 		/*
 		 * Dropping the reference may invoke virtio_uml_release_dev(),
 		 * which closes the fds and frees vu_dev, so vu_dev must not
 		 * be touched after this point.
 		 */
 		put_device(&vu_dev->vdev.dev);
 		return rc;
 	}

 	vu_dev->registered = 1;
 	return 0;

 error_init:
 	os_close_file(vu_dev->sock);
 error_free:
 	kfree(vu_dev);
 	return rc;
 }

 /*
  * Platform driver remove: unregister the virtio device stored as driver
  * data by the probe function. Always returns 0.
  */
 static int virtio_uml_remove(struct platform_device *pdev)
 {
 	struct virtio_uml_device *vu_dev;

 	vu_dev = platform_get_drvdata(pdev);
 	unregister_virtio_device(&vu_dev->vdev);

 	return 0;
 }

 /* Command line device list */

 /*
  * Release callback for vu_cmdline_parent. Intentionally empty: the parent
  * device is a static object, so there is nothing to free.
  */
 static void vu_cmdline_release_dev(struct device *d)
 {
 }

 /* Parent device of all platform devices created by vu_cmdline_set(). */
 static struct device vu_cmdline_parent = {
 	.init_name = "virtio-uml-cmdline",
 	.release = vu_cmdline_release_dev,
 };

 /* true once vu_cmdline_parent has been registered */
 static bool vu_cmdline_parent_registered;
 /*
  * Platform device id for the next command line device; overwritten when
  * a parameter specifies an explicit <platform_id>, incremented after use.
  */
 static int vu_cmdline_id;

 /*
  * Remove one command line device: free its socket path string and
  * unregister the platform device. @data is unused. Always returns 0, so
  * it can be used as a device_for_each_child() callback.
  */
 static int vu_unregister_cmdline_device(struct device *dev, void *data)
 {
 	struct platform_device *cmdline_pdev = to_platform_device(dev);
 	struct virtio_uml_platform_data *pd;

 	pd = cmdline_pdev->dev.platform_data;
 	kfree(pd->socket_path);

 	platform_device_unregister(cmdline_pdev);

 	return 0;
 }

 /*
  * Work handler scheduled when the vhost-user connection was reset:
  * unregister the platform device the work item belongs to.
  */
 static void vu_conn_broken(struct work_struct *wk)
 {
 	struct virtio_uml_platform_data *pdata =
 		container_of(wk, struct virtio_uml_platform_data,
 			     conn_broken_wk);
 	struct device *dev = &pdata->pdev->dev;

 	vu_unregister_cmdline_device(dev, NULL);
 }

 /*
  * kernel_param_ops.set for the "device" parameter.
  *
  * Parses "<socket>:<virtio_id>[:<platform_id>]" (splitting at the first
  * ':'), registers the shared parent device on first use and then
  * registers a "virtio-uml" platform device carrying the socket path and
  * virtio device id as platform data.
  *
  * Returns 0 on success, -EINVAL on a malformed parameter, -ENOMEM if the
  * socket path cannot be duplicated, or the error from device
  * registration.
  */
 static int vu_cmdline_set(const char *device, const struct kernel_param *kp)
 {
 	/* everything before the first ':' is the socket path */
 	const char *ids = strchr(device, ':');
 	unsigned int virtio_device_id;
 	int processed, consumed, err;
 	char *socket_path;
 	struct virtio_uml_platform_data pdata, *ppdata;
 	struct platform_device *pdev;

 	/* reject a missing ':' and an empty socket path */
 	if (!ids || ids == device)
 		return -EINVAL;

 	/*
 	 * The optional platform id is parsed straight into vu_cmdline_id;
 	 * "consumed" ends up at the end of the last field that matched.
 	 */
 	processed = sscanf(ids, ":%u%n:%d%n",
 			   &virtio_device_id, &consumed,
 			   &vu_cmdline_id, &consumed);

 	/* need at least the virtio id, and no trailing characters */
 	if (processed < 1 || ids[consumed])
 		return -EINVAL;

 	if (!vu_cmdline_parent_registered) {
 		err = device_register(&vu_cmdline_parent);
 		if (err) {
 			pr_err("Failed to register parent device!\n");
 			put_device(&vu_cmdline_parent);
 			return err;
 		}
 		vu_cmdline_parent_registered = true;
 	}

 	socket_path = kmemdup_nul(device, ids - device, GFP_KERNEL);
 	if (!socket_path)
 		return -ENOMEM;

 	/* only these two fields are set here; the rest is filled in below */
 	pdata.virtio_device_id = (u32) virtio_device_id;
 	pdata.socket_path = socket_path;

 	pr_info("Registering device virtio-uml.%d id=%d at %s\n",
 		vu_cmdline_id, virtio_device_id, socket_path);

 	pdev = platform_device_register_data(&vu_cmdline_parent, "virtio-uml",
 					     vu_cmdline_id++, &pdata,
 					     sizeof(pdata));
 	err = PTR_ERR_OR_ZERO(pdev);
 	if (err)
 		goto free;

 	/* finish initialising the platform data now attached to the device */
 	ppdata = pdev->dev.platform_data;
 	ppdata->pdev = pdev;
 	INIT_WORK(&ppdata->conn_broken_wk, vu_conn_broken);

 	return 0;

 free:
 	kfree(socket_path);
 	return err;
 }

 /*
  * device_for_each_child() callback: append one
  * "<socket>:<virtio_id>:<platform_id>" line for @dev to the string in
  * @data, which is treated as a PAGE_SIZE buffer. Always returns 0.
  */
 static int vu_cmdline_get_device(struct device *dev, void *data)
 {
 	struct platform_device *pdev = to_platform_device(dev);
 	struct virtio_uml_platform_data *pdata = pdev->dev.platform_data;
 	char *buffer = data;
 	unsigned int used = strlen(buffer);
 	char *tail = buffer + used;

 	snprintf(tail, PAGE_SIZE - used, "%s:%d:%d\n",
 		 pdata->socket_path, pdata->virtio_device_id, pdev->id);

 	return 0;
 }

 /*
  * kernel_param_ops.get for the "device" parameter: list all command line
  * devices, one per line, in @buffer. Returns the string length plus one.
  */
 static int vu_cmdline_get(char *buffer, const struct kernel_param *kp)
 {
 	buffer[0] = '\0';

 	if (vu_cmdline_parent_registered) {
 		device_for_each_child(&vu_cmdline_parent, buffer,
 				      vu_cmdline_get_device);
 	}

 	return strlen(buffer) + 1;
 }

 /* Handlers for setting and reading back the "device" parameter. */
 static const struct kernel_param_ops vu_cmdline_param_ops = {
 	.set = vu_cmdline_set,
 	.get = vu_cmdline_get,
 };

 /*
  * Register the "device" parameter with the handlers above (permissions
  * S_IRUSR) and its UML command line help text.
  */
 device_param_cb(device, &vu_cmdline_param_ops, NULL, S_IRUSR);
 __uml_help(vu_cmdline_param_ops,
 "virtio_uml.device=<socket>:<virtio_id>[:<platform_id>]\n"
 "    Configure a virtio device over a vhost-user socket.\n"
 "    See virtio_ids.h for a list of possible virtio device id values.\n"
 "    Optionally use a specific platform_device id.\n\n"
 );


 /*
  * Unregister every command line device and then the shared parent
  * device. Does nothing if the parent was never registered.
  */
 static void vu_unregister_cmdline_devices(void)
 {
 	if (!vu_cmdline_parent_registered)
 		return;

 	device_for_each_child(&vu_cmdline_parent, NULL,
 			      vu_unregister_cmdline_device);
 	device_unregister(&vu_cmdline_parent);
 	vu_cmdline_parent_registered = false;
 }

 /* Platform driver */

 /* Device tree match table: binds to nodes compatible with "virtio,uml". */
 static const struct of_device_id virtio_uml_match[] = {
 	{ .compatible = "virtio,uml", },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, virtio_uml_match);

 /*
  * Platform driver definition. The driver name matches the name given to
  * the platform devices registered by vu_cmdline_set().
  */
 static struct platform_driver virtio_uml_driver = {
 	.probe = virtio_uml_probe,
 	.remove = virtio_uml_remove,
 	.driver = {
 		.name = "virtio-uml",
 		.of_match_table = virtio_uml_match,
 	},
 };

 /* Module init: register the platform driver. */
 static int __init virtio_uml_init(void)
 {
 	int rc;

 	rc = platform_driver_register(&virtio_uml_driver);
 	return rc;
 }

 /*
  * Module exit: unregister the platform driver first, then remove the
  * command line devices and their parent.
  */
 static void __exit virtio_uml_exit(void)
 {
 	platform_driver_unregister(&virtio_uml_driver);
 	vu_unregister_cmdline_devices();
 }

 /*
  * Hook up init/exit. virtio_uml_exit is registered both as the module
  * exit function and as a UML exit call.
  */
 module_init(virtio_uml_init);
 module_exit(virtio_uml_exit);
 __uml_exitcall(virtio_uml_exit);

 MODULE_DESCRIPTION("UML driver for vhost-user virtio devices");
 MODULE_LICENSE("GPL");