kvm tools: add support for device passthrough using VFIO

WIP: This is not suitable for (sane) human consumption yet!

First unbind the device from its host driver and register its vendor/device
ID pair with vfio-pci (1af4:1001 here, i.e. a virtio-blk device):

echo 0000:00:00.0 > /sys/bus/pci/devices/0000:00:00.0/driver/unbind
echo 1af4 1001 > /sys/bus/pci/drivers/vfio-pci/new_id

Then pass --vfio-groups=1 to lkvm and watch the world die a painful
death.
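
For example, to hand the devices in IOMMU group 1 to the guest (kernel
and disk arguments as usual):

  lkvm run --vfio-groups=1 -k bzImage -d rootfs.img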

TODO:
 - MSI-X Interrupts and irqfd

Signed-off-by: Will Deacon <will.deacon@arm.com>
diff --git a/Makefile b/Makefile
index a2901b9..c48bfd9 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,7 @@
 OBJS	+= mmio.o
 OBJS	+= pci.o
 OBJS	+= term.o
+OBJS	+= vfio.o
 OBJS	+= virtio/blk.o
 OBJS	+= virtio/scsi.o
 OBJS	+= virtio/console.o
diff --git a/arm/pci.c b/arm/pci.c
index 813df26..4b9e9b9 100644
--- a/arm/pci.c
+++ b/arm/pci.c
@@ -1,5 +1,6 @@
 #include "kvm/devices.h"
 #include "kvm/fdt.h"
+#include "kvm/kvm.h"
 #include "kvm/of_pci.h"
 #include "kvm/pci.h"
 #include "kvm/util.h"
diff --git a/builtin-run.c b/builtin-run.c
index 7f3872d..e165336 100644
--- a/builtin-run.c
+++ b/builtin-run.c
@@ -158,6 +158,11 @@
 	OPT_INTEGER('\0', "debug-iodelay", &(cfg)->debug_iodelay,	\
 			"Delay IO by millisecond"),			\
 									\
+	OPT_GROUP("VFIO options:"),					\
+	OPT_CALLBACK('\0', "vfio-groups", NULL, "group number,...",	\
+			"Pass through a list of VFIO groups to the "	\
+			"virtual machine", vfio_group_parser, kvm),	\
+									\
 	OPT_ARCH(RUN, cfg)						\
 	OPT_END()							\
 	};
diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
index 386fa8c..62dc6a2 100644
--- a/include/kvm/kvm-config.h
+++ b/include/kvm/kvm-config.h
@@ -2,6 +2,7 @@
 #define KVM_CONFIG_H_
 
 #include "kvm/disk-image.h"
+#include "kvm/vfio.h"
 #include "kvm/kvm-config-arch.h"
 
 #define DEFAULT_KVM_DEV		"/dev/kvm"
@@ -20,9 +21,11 @@
 struct kvm_config {
 	struct kvm_config_arch arch;
 	struct disk_image_params disk_image[MAX_DISK_IMAGES];
+	struct vfio_group vfio_group[MAX_VFIO_GROUPS];
 	u64 ram_size;
 	u8  image_count;
 	u8 num_net_devices;
+	u8 num_vfio_groups;
 	bool virtio_rng;
 	int active_console;
 	int debug_iodelay;
diff --git a/include/kvm/pci.h b/include/kvm/pci.h
index b0c28a1..2045ba2 100644
--- a/include/kvm/pci.h
+++ b/include/kvm/pci.h
@@ -7,7 +7,6 @@
 #include <endian.h>
 
 #include "kvm/devices.h"
-#include "kvm/kvm.h"
 #include "kvm/msi.h"
 
 /*
@@ -21,6 +20,8 @@
 #define PCI_IO_SIZE		0x100
 #define PCI_CFG_SIZE		(1ULL << 24)
 
+struct kvm;
+
 union pci_config_address {
 	struct {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
@@ -57,33 +58,59 @@
 	u32 pba_offset;
 };
 
+#define PCI_BAR_OFFSET(b)	(offsetof(struct pci_device_header, bar[b]))
+#define PCI_DEV_CFG_SIZE	256
+#define PCI_DEV_CFG_MASK	(PCI_DEV_CFG_SIZE - 1)
+
+struct pci_device_header;
+
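+/*
+ * Optional hooks invoked on guest configuration space accesses, before
+ * the generic code consults or updates the shadow copy held in
+ * pci_device_header.
+ */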
+struct pci_config_operations {
+	void (*write)(struct pci_device_header *pci_hdr, u8 offset, void *data,
+		      int sz);
+	void (*read)(struct pci_device_header *pci_hdr, u8 offset, void *data,
+		     int sz);
+};
+
 struct pci_device_header {
-	u16		vendor_id;
-	u16		device_id;
-	u16		command;
-	u16		status;
-	u8		revision_id;
-	u8		class[3];
-	u8		cacheline_size;
-	u8		latency_timer;
-	u8		header_type;
-	u8		bist;
-	u32		bar[6];
-	u32		card_bus;
-	u16		subsys_vendor_id;
-	u16		subsys_id;
-	u32		exp_rom_bar;
-	u8		capabilities;
-	u8		reserved1[3];
-	u32		reserved2;
-	u8		irq_line;
-	u8		irq_pin;
-	u8		min_gnt;
-	u8		max_lat;
-	struct msix_cap msix;
-	u8		empty[136]; /* Rest of PCI config space */
+	/* Configuration space, as seen by the guest */
+	union {
+		struct {
+			u16		vendor_id;
+			u16		device_id;
+			u16		command;
+			u16		status;
+			u8		revision_id;
+			u8		class[3];
+			u8		cacheline_size;
+			u8		latency_timer;
+			u8		header_type;
+			u8		bist;
+			u32		bar[6];
+			u32		card_bus;
+			u16		subsys_vendor_id;
+			u16		subsys_id;
+			u32		exp_rom_bar;
+			u8		capabilities;
+			u8		reserved1[3];
+			u32		reserved2;
+			u8		irq_line;
+			u8		irq_pin;
+			u8		min_gnt;
+			u8		max_lat;
+			struct msix_cap msix;
+		} __attribute__((packed));
+		/* Pad to PCI config space size */
+		u8	__pad[PCI_DEV_CFG_SIZE];
+	};
+
+	/* Private to lkvm */
 	u32		bar_size[6];
-} __attribute__((packed));
+	struct pci_config_operations	cfg_ops;
+};
 
 int pci__init(struct kvm *kvm);
 int pci__exit(struct kvm *kvm);
diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
new file mode 100644
index 0000000..20cc7b6
--- /dev/null
+++ b/include/kvm/vfio.h
@@ -0,0 +1,48 @@
+#ifndef KVM__VFIO_H
+#define KVM__VFIO_H
+
+#include "kvm/parse-options.h"
+#include "kvm/pci.h"
+
+#include <linux/vfio.h>
+
+#define MAX_VFIO_GROUPS	4
+
+struct vfio_pci_region_info {
+	struct vfio_region_info		info;
+	u32				guest_phys_addr;
+	void				*host_addr;
+};
+
+struct vfio_pci_irq_info {
+	struct vfio_irq_info		info;
+	int				eventfd;
+	u8				legacy_line;
+};
+
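+/*
+ * One passthrough device. pci_hdr is the shadow configuration space
+ * presented to the guest; fd is the VFIO device fd backing the real
+ * device.
+ */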
+struct vfio_device {
+	struct list_head		list;
+
+	struct pci_device_header	pci_hdr;
+	struct device_header		dev_hdr;
+
+	int				fd;
+	struct vfio_device_info		info;
+	struct vfio_pci_irq_info	irq;
+	struct vfio_pci_region_info	*regions;
+};
+
+struct vfio_group {
+	unsigned long			id; /* iommu_group number in sysfs */
+	int				fd;
+	struct list_head		devices;
+};
+
+int vfio_group_parser(const struct option *opt, const char *arg, int unset);
+
+#endif /* KVM__VFIO_H */
diff --git a/pci.c b/pci.c
index 3a6696c..03b650b 100644
--- a/pci.c
+++ b/pci.c
@@ -8,8 +8,6 @@
 #include <linux/err.h>
 #include <assert.h>
 
-#define PCI_BAR_OFFSET(b)		(offsetof(struct pci_device_header, bar[b]))
-
 static u32 pci_config_address_bits;
 
 /* This is within our PCI gap - in an unused area.
@@ -131,61 +129,52 @@
 
 void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
 {
-	u8 dev_num;
+	u8 offset;
+	struct pci_device_header *pci_hdr;
+	void *base;
+	u8 bar, dev_num = addr.device_number;
 
-	dev_num	= addr.device_number;
+	if (!pci_device_exists(addr.bus_number, dev_num, 0))
+		return;
 
-	if (pci_device_exists(0, dev_num, 0)) {
-		unsigned long offset;
+	offset = addr.w & PCI_DEV_CFG_MASK;
+	base = pci_hdr = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+	bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32));
 
-		offset = addr.w & 0xff;
-		if (offset < sizeof(struct pci_device_header)) {
-			void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
-			struct pci_device_header *hdr = p;
-			u8 bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32));
-			u32 sz = cpu_to_le32(PCI_IO_SIZE);
+	if (pci_hdr->cfg_ops.write)
+		pci_hdr->cfg_ops.write(pci_hdr, offset, data, size);
 
-			if (bar < 6 && hdr->bar_size[bar])
-				sz = hdr->bar_size[bar];
-
-			/*
-			 * If the kernel masks the BAR it would expect to find the
-			 * size of the BAR there next time it reads from it.
-			 * When the kernel got the size it would write the address
-			 * back.
-			 */
-			if (*(u32 *)(p + offset)) {
-				/* See if kernel tries to mask one of the BARs */
-				if ((offset >= PCI_BAR_OFFSET(0)) &&
-				    (offset <= PCI_BAR_OFFSET(6)) &&
-				    (ioport__read32(data)  == 0xFFFFFFFF))
-					memcpy(p + offset, &sz, sizeof(sz));
-				    else
-					memcpy(p + offset, data, size);
-			}
-		}
+	if (offset >= PCI_BAR_OFFSET(0) && bar < 6 &&
+	    (ioport__read32(data) == 0xFFFFFFFF)) {
+		/*
+		 * If the kernel masks the BAR it would expect to
+		 * find the size of the BAR there next time it reads
+		 * from it. When the kernel got the size it would
+		 * write the address back.
+		 */
+		u32 sz = pci_hdr->bar_size[bar];
+		memcpy(base + offset, &sz, sizeof(sz));
+	} else if (*(u32 *)(base + offset)) {
+		memcpy(base + offset, data, size);
 	}
 }
 
 void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
 {
-	u8 dev_num;
+	u8 offset;
+	struct pci_device_header *pci_hdr;
+	u8 dev_num = addr.device_number;
 
-	dev_num	= addr.device_number;
-
-	if (pci_device_exists(0, dev_num, 0)) {
-		unsigned long offset;
-
-		offset = addr.w & 0xff;
-		if (offset < sizeof(struct pci_device_header)) {
-			void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
-
-			memcpy(data, p + offset, size);
-		} else {
-			memset(data, 0x00, size);
-		}
-	} else {
+	if (!pci_device_exists(addr.bus_number, dev_num, 0)) {
 		memset(data, 0xff, size);
+	} else {
+		pci_hdr = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+		offset = addr.w & PCI_DEV_CFG_MASK;
+
+		if (pci_hdr->cfg_ops.read)
+			pci_hdr->cfg_ops.read(pci_hdr, offset, data, size);
+
+		memcpy(data, (void *)pci_hdr + offset, size);
 	}
 }
 
diff --git a/vfio.c b/vfio.c
new file mode 100644
index 0000000..9f1ae22
--- /dev/null
+++ b/vfio.c
@@ -0,0 +1,766 @@
+#include "kvm/vfio.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <linux/pci_regs.h>
+
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+
+#include <dirent.h>
+#include <pthread.h>
+
+#define VFIO_DEV_DIR	"/dev/vfio"
+#define VFIO_DEV_NODE	VFIO_DEV_DIR "/vfio"
+#define IOMMU_GROUP_DIR	"/sys/kernel/iommu_groups"
+
+static int vfio_container;
+
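+/* Parse --vfio-groups=<id>[,<id>...] into kvm->cfg.vfio_group[] */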
+int vfio_group_parser(const struct option *opt, const char *arg, int unset)
+{
+	char *cur, *buf = strdup(arg);
+	int idx = 0;
+	struct kvm *kvm = opt->ptr;
+
+	cur = strtok(buf, ",");
+	while (cur && idx < MAX_VFIO_GROUPS) {
+		struct vfio_group *group = &kvm->cfg.vfio_group[idx++];
+
+		group->id = strtoul(cur, NULL, 0);
+		INIT_LIST_HEAD(&group->devices);
+		cur = strtok(NULL, ",");
+	}
+
+	if (cur)
+		pr_warning("Truncating VFIO group list to %d entries",
+				MAX_VFIO_GROUPS);
+
+	kvm->cfg.num_vfio_groups = idx;
+	free(buf);
+	return 0;
+}
+
+static void vfio_pci_cfg_read(struct pci_device_header *pci_hdr, u8 offset,
+			      void *data, int sz)
+{
+	struct vfio_region_info *info;
+	struct vfio_device *device;
+	char dummy[sz];
+
+	device = container_of(pci_hdr, struct vfio_device, pci_hdr);
+	info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+	/* Dummy read in case of side-effects */
+	if (pread(device->fd, dummy, sz, info->offset + offset) != sz)
+		pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
+				sz, offset);
+}
+
+static void vfio_pci_cfg_write(struct pci_device_header *pci_hdr, u8 offset,
+			       void *data, int sz)
+{
+	struct vfio_region_info *info;
+	struct vfio_device *device;
+	void *base = pci_hdr;
+
+	device = container_of(pci_hdr, struct vfio_device, pci_hdr);
+	info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+	if (pwrite(device->fd, data, sz, info->offset + offset) != sz)
+		pr_warning("Failed to write %d bytes to Configuration Space at 0x%x",
+				sz, offset);
+
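+	/* Read back so the shadow copy reflects what the device accepted */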
+	if (pread(device->fd, base + offset, sz, info->offset + offset) != sz)
+		pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
+				sz, offset);
+}
+
+static int vfio_pci_parse_msix_cap(struct vfio_device *device)
+{
+	u8 pos, caps[2];
+	struct vfio_region_info *info;
+	ssize_t sz = sizeof(caps);
+
+	if (!(device->pci_hdr.status & PCI_STATUS_CAP_LIST))
+		return -ENODEV;
+
+	pos = device->pci_hdr.capabilities & ~3;
+	info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
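+	/* Walk the capability list looking for the MSI-X capability */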
+	while (pos) {
+		if (pread(device->fd, caps, sz, info->offset + pos) != sz) {
+			pr_warning("Failed to read from capabilities pointer (0x%x)",
+				   pos);
+			return -EINVAL;
+		}
+
+		if (caps[0] != PCI_CAP_ID_MSIX) {
+			pos = caps[1];
+			continue;
+		}
+
+		/* Slurp the MSI-X capability. */
+		sz = sizeof(device->pci_hdr.msix);
+		if (pread(device->fd, &device->pci_hdr.msix, sz,
+			  info->offset + pos) != sz) {
+			pr_warning("Failed to read MSI-X capability structure");
+			device->pci_hdr.msix.cap = 0;
+			return -EINVAL;
+		}
+
+		return 0;
+	}
+
+	return -ENODEV;
+}
+
+static int vfio_pci_parse_cfg_space(struct vfio_device *device)
+{
+	struct vfio_region_info *info;
+	ssize_t sz = PCI_DEV_CFG_SIZE;
+
+	if (device->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
+		pr_err("Configuration Space not found");
+		return -ENODEV;
+	}
+
+	info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	*info = (struct vfio_region_info) {
+			.argsz = sizeof(*info),
+			.index = VFIO_PCI_CONFIG_REGION_INDEX,
+	};
+
+	ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+	if (!info->size) {
+		pr_err("Configuration Space has size zero?!");
+		return -EINVAL;
+	}
+
+	if (pread(device->fd, &device->pci_hdr, sz, info->offset) != sz) {
+		pr_err("Failed to read %zd bytes of Configuration Space", sz);
+		return -EIO;
+	}
+
+	if ((device->pci_hdr.header_type & 0x7f) != PCI_HEADER_TYPE_NORMAL) {
+		pr_err("Unsupported header type %u",
+			device->pci_hdr.header_type);
+		return -EOPNOTSUPP;
+	}
+
+	if (vfio_pci_parse_msix_cap(device))
+		pr_warning("Failed to parse device MSI-X capability -- attempting INTx");
+
+	return 0;
+}
+
+static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
+{
+	int i;
+	struct vfio_region_info *info;
+	ssize_t sz = PCI_DEV_CFG_SIZE;
+
+	/* Enable exclusively MMIO and bus mastering */
+	device->pci_hdr.command &= ~PCI_COMMAND_IO;
+	device->pci_hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
+
+	/* Initialise the BARs */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		struct vfio_pci_region_info *region = &device->regions[i];
+		u32 base = region->guest_phys_addr;
+
+		if (!base)
+			continue;
+
+		device->pci_hdr.bar_size[i] = region->info.size;
+
+		/* Construct a fake reg to match what we've mapped. */
+		device->pci_hdr.bar[i] = (base & PCI_BASE_ADDRESS_MEM_MASK) |
+					  PCI_BASE_ADDRESS_SPACE_MEMORY |
+					  PCI_BASE_ADDRESS_MEM_TYPE_32;
+	}
+
+	/* I really can't be bothered to support cardbus. */
+	device->pci_hdr.card_bus = 0;
+
+	/*
+	 * Nuke the expansion ROM for now. If we want to do this properly,
+	 * we need to save its size somewhere and map into the guest.
+	 */
+	device->pci_hdr.exp_rom_bar = 0;
+
+	/*
+	 * FIXME: we don't support MSI-X yet, so nuke the capability here.
+	 * This forces the else branch below, which leaves the guest with
+	 * an empty capability list.
+	 */
+	device->pci_hdr.msix.cap = 0;
+
+	/* Plumb in our fake MSI-X capability, if we have it. */
+	if (device->pci_hdr.msix.cap) {
+		device->pci_hdr.capabilities =
+			(void *)&device->pci_hdr.msix - (void *)&device->pci_hdr;
+		device->pci_hdr.msix.next = 0;
+	} else {
+		device->pci_hdr.capabilities = 0;
+	}
+
+	/* Install our fake Configuration Space */
+	info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+	if (pwrite(device->fd, &device->pci_hdr, sz, info->offset) != sz) {
+		pr_err("Failed to write %zd bytes to Configuration Space", sz);
+		return -EIO;
+	}
+
+	/* Register callbacks for cfg accesses */
+	device->pci_hdr.cfg_ops = (struct pci_config_operations) {
+		.read	= vfio_pci_cfg_read,
+		.write	= vfio_pci_cfg_write,
+	};
+
+	return 0;
+}
+
+static int vfio_pci_map_bar(struct kvm *kvm, int fd,
+			    struct vfio_pci_region_info *region)
+{
+	void *base;
+	int ret, prot = 0;
+
+	/*
+	 * We don't want to mess about trapping BAR accesses, so require
+	 * that they can be mmap'd. Note that this precludes the use of
+	 * I/O BARs in the guest (we will hide them from Configuration
+	 * Space, which is trapped).
+	 */
+	if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
+		pr_info("Ignoring BAR %u, as it can't be mmap'd",
+			region->info.index);
+		return 0;
+	}
+
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
+		prot |= PROT_READ;
+	if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
+		prot |= PROT_WRITE;
+
+	base = mmap(NULL, region->info.size, prot, MAP_SHARED, fd,
+		    region->info.offset);
+	if (base == MAP_FAILED) {
+		ret = -errno;
+		pr_err("Failed to mmap BAR region %u (0x%llx bytes)",
+			region->info.index, region->info.size);
+		return ret;
+	}
+	region->host_addr = base;
+
+	/* Grab some MMIO space in the guest */
+	region->guest_phys_addr = pci_get_io_space_block(region->info.size);
+
+	/* Register the BAR as a memory region with KVM */
+	ret = kvm__register_mem(kvm, region->guest_phys_addr, region->info.size,
+				region->host_addr);
+	if (ret) {
+		pr_err("Failed to register BAR as memory region with KVM");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int vfio_pci_configure_dev_regions(struct kvm *kvm,
+					  struct vfio_device *device)
+{
+	int ret;
+	u32 i, num_regions = device->info.num_regions;
+
+	ret = vfio_pci_parse_cfg_space(device);
+	if (ret)
+		return ret;
+
+	/* First of all, map the BARs directly into the guest */
+	for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+		struct msix_cap *msix = &device->pci_hdr.msix;
+		struct vfio_pci_region_info *region;
+
+		if (i >= num_regions)
+			break;
+
+		region = &device->regions[i];
+		region->info = (struct vfio_region_info) {
+			.argsz = sizeof(region->info),
+			.index = i,
+		};
+
+		ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, &region->info);
+		/* Ignore invalid or unimplemented regions */
+		if (!region->info.size)
+			continue;
+
+		/* Avoid trying to map MSI-X BARs */
+		if (msix->cap) {
+			if ((msix->table_offset & PCI_MSIX_TABLE_BIR) == i)
+				continue;
+			if ((msix->pba_offset & PCI_MSIX_PBA_BIR) == i)
+				continue;
+		}
+
+		/*
+		 * Map the BARs into the guest. We'll later need to update
+		 * configuration space to reflect our allocation.
+		 */
+		ret = vfio_pci_map_bar(kvm, device->fd, region);
+		if (ret)
+			return ret;
+	}
+
+	/* We've configured the BARs, fake up a Configuration Space */
+	return vfio_pci_fixup_cfg_space(device);
+}
+
+static int vfio_configure_dev_regions(struct kvm *kvm,
+				      struct vfio_device *device)
+{
+	u32 num_regions = device->info.num_regions;
+
+	/* We only support vfio-pci devices for the moment */
+	if (!(device->info.flags & VFIO_DEVICE_FLAGS_PCI)) {
+		pr_warning("Only vfio-pci devices are supported. "
+			"Ignoring device regions.");
+		device->info.num_regions = 0;
+		return 0;
+	}
+
+	device->regions = calloc(num_regions, sizeof(*device->regions));
+	if (!device->regions) {
+		pr_err("Failed to allocate %u regions for device",
+			num_regions);
+		return -ENOMEM;
+	}
+
+	return vfio_pci_configure_dev_regions(kvm, device);
+}
+
+/*
+ * FIXME: This should use KVM_IRQFD to avoid the round-trip to userspace,
+ *        but that relies on CONFIG_HAVE_KVM_IRQ_ROUTING in the host
+ *        (i.e. KVM_CAP_IRQ_ROUTING). Eric Auger (ST/Linaro) is working
+ *        on this. Until then, make use of this horrible kludge.
+ */
+
+static int epoll_fd = -1;
+static pthread_t intx_thread;
+
+/* Alleeexxxx! */
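+/* struct vfio_irq_set ends in a flexible data[]; bolt the eventfd on */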
+struct non_braindead_vfio_irq_set {
+	struct vfio_irq_set	irq;
+	int			fd;
+};
+
+static void *vfio_pci_intx__thread(void *param)
+{
+	struct epoll_event event;
+	struct kvm *kvm = param;
+	struct non_braindead_vfio_irq_set irq = {
+		.irq = {
+			.argsz	= sizeof(irq),
+			.flags	= VFIO_IRQ_SET_DATA_NONE |
+				  VFIO_IRQ_SET_ACTION_UNMASK,
+			.index	= VFIO_PCI_INTX_IRQ_INDEX,
+			.start	= 0,
+			.count	= 1,
+		},
+	};
+
+	kvm__set_thread_name("vfio-pci-intx");
+
+	for (;;) {
+		u64 tmp;
+		int nfds;
+		struct vfio_device *device;
+
+		nfds = epoll_wait(epoll_fd, &event, 1, -1);
+		if (nfds <= 0)
+			continue;
+
+		device = event.data.ptr;
+		if (read(device->irq.eventfd, &tmp, sizeof(tmp)) < 0)
+			pr_warning("Failed to read VFIO INTx event");
+
+		kvm__irq_trigger(kvm, device->irq.legacy_line);
+
+		/*
+		 * We can only unmask the interrupt straight away, since
+		 * there isn't a reliable way to know when the guest has
+		 * de-asserted the line on the device. Unfortunately, if
+		 * the guest is busy doing something else (like handling
+		 * another interrupt), then we'll trigger the spurious
+		 * IRQ detector in the host and the physical IRQ will be
+		 * masked. Worse still, we can't ask KVM about the status
+		 * of the virtual interrupt line, so all we can do is
+		 * sleep for 1ms and hope for the best. IRQFD will solve
+		 * this for us.
+		 */
+		usleep(1000);
+		irq.fd = device->irq.eventfd;
+		if (ioctl(device->fd, VFIO_DEVICE_SET_IRQS, &irq.irq) < 0)
+			pr_warning("Failed to UNMASK IRQ in INTx loop");
+	}
+
+	return NULL;
+}
+
+static int vfio_pci_init_intx_eventfd(struct kvm *kvm,
+				      struct vfio_device *device)
+{
+	int fd, ret;
+	struct non_braindead_vfio_irq_set irq;
+	struct epoll_event ev = { 0 };
+
+	/* Initialise the epoll fd and worker thread. */
+	if (epoll_fd < 0) {
+		epoll_fd = epoll_create1(0);
+		if (epoll_fd < 0) {
+			ret = -errno;
+			pr_err("Failed to create epoll descriptor for INTx thread");
+			return ret;
+		}
+
+		ret = pthread_create(&intx_thread, NULL, vfio_pci_intx__thread,
+				     kvm);
+		if (ret) {
+			pr_err("Failed to start INTx thread");
+			return -ret;
+		}
+	}
+
+	/*
+	 * Create an eventfd for our physical interrupt and add that to
+	 * the epoll fd.
+	 */
+	fd = eventfd(0, 0);
+	if (fd < 0) {
+		ret = -errno;
+		pr_err("Failed to create eventfd");
+		return ret;
+	}
+
+	ev.events		= EPOLLIN;
+	ev.data.ptr		= device;
+	device->irq.eventfd	= fd;
+	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+		ret = -errno;
+		pr_err("Failed to add eventfd to epoll descriptor");
+		return ret;
+	}
+
+	/* Plumb the eventfd into the irq. */
+	irq.irq = (struct vfio_irq_set) {
+		.argsz	= sizeof(irq),
+		.flags	= VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+		.index	= VFIO_PCI_INTX_IRQ_INDEX,
+		.start	= 0,
+		.count	= 1,
+	};
+	irq.fd = fd;
+
+	ret = ioctl(device->fd, VFIO_DEVICE_SET_IRQS, &irq.irq);
+	if (ret < 0) {
+		pr_err("Failed to setup VFIO IRQs");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device)
+{
+	int ret;
+
+	device->irq.info = (struct vfio_irq_info) {
+		.argsz = sizeof(device->irq.info)
+	};
+
+	if (device->pci_hdr.msix.cap) {
+		/* TODO: set up shadow PBA/table structures for MSI-X. */
+	} else {
+		/* We don't have MSI-X, so fall back on INTx */
+		pr_info("MSI-X not available for device 0x%x, falling back to INTx",
+			device->dev_hdr.dev_num);
+		device->irq.legacy_line = device->pci_hdr.irq_line;
+		device->irq.info.index = VFIO_PCI_INTX_IRQ_INDEX;
+		ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq.info);
+
+		if (device->irq.info.count != 1) {
+			pr_err("No INTx interrupts found");
+			return -ENODEV;
+		}
+
+		if (!(device->irq.info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+			pr_err("INTx interrupt not EVENTFD capable");
+			return -EINVAL;
+		}
+
+		if (!(device->irq.info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
+			pr_err("INTx interrupt not AUTOMASKED");
+			return -EINVAL;
+		}
+
+		ret = vfio_pci_init_intx_eventfd(kvm, device);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int vfio_configure_iommu_groups(struct kvm *kvm)
+{
+	int i, ret;
+
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		DIR *dir;
+		struct dirent *dirent;
+		char dirpath[PATH_MAX];
+		struct vfio_group *group = &kvm->cfg.vfio_group[i];
+
+		snprintf(dirpath, PATH_MAX, IOMMU_GROUP_DIR "/%lu/devices",
+			 group->id);
+
+		dir = opendir(dirpath);
+		if (!dir) {
+			ret = -errno;
+			pr_err("Failed to open IOMMU group %s", dirpath);
+			return ret;
+		}
+
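+		/* Group members appear as symlinks named by PCI address */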
+		while ((dirent = readdir(dir))) {
+			struct vfio_device *device;
+
+			if (dirent->d_type != DT_LNK)
+				continue;
+
+			device = calloc(1, sizeof(*device));
+			if (!device) {
+				pr_err("Failed to allocate VFIO device");
+				return -ENOMEM;
+			}
+
+			INIT_LIST_HEAD(&device->list);
+			device->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD,
+					   dirent->d_name);
+			if (device->fd < 0) {
+				ret = -errno;
+				pr_err("Failed to get FD for device %s in group %lu",
+					dirent->d_name, group->id);
+				free(device);
+				/* The device might be a bridge without an fd */
+				continue;
+			}
+
+			if (ioctl(device->fd, VFIO_DEVICE_RESET) < 0)
+				pr_warning("Failed to reset device %s in group %lu",
+						dirent->d_name, group->id);
+
+			device->info.argsz = sizeof(device->info);
+			if (ioctl(device->fd, VFIO_DEVICE_GET_INFO, &device->info)) {
+				ret = -errno;
+				pr_err("Failed to get info for device %s in group %lu",
+					dirent->d_name, group->id);
+				return ret;
+			}
+
+			ret = vfio_configure_dev_regions(kvm, device);
+			if (ret) {
+				pr_err("Failed to configure regions for device %s in group %lu",
+					dirent->d_name, group->id);
+				return ret;
+			}
+
+			device->dev_hdr = (struct device_header) {
+				.bus_type	= DEVICE_BUS_PCI,
+				.data		= &device->pci_hdr,
+			};
+
+			ret = device__register(&device->dev_hdr);
+			if (ret) {
+				pr_err("Failed to register VFIO device");
+				return ret;
+			}
+
+			ret = vfio_configure_dev_irqs(kvm, device);
+			if (ret) {
+				pr_err("Failed to configure IRQs for device %s in group %lu",
+					dirent->d_name, group->id);
+				return ret;
+			}
+
+			pr_info("Assigned device %s in group %lu to device number 0x%x",
+				dirent->d_name, group->id, device->dev_hdr.dev_num);
+
+			list_add(&device->list, &group->devices);
+		}
+
+		if (closedir(dir))
+			pr_warning("Failed to close IOMMU group %s", dirpath);
+	}
+
+	return 0;
+}
+
+/* TODO: this should be an arch callback, so arm can return HYP only if vsmmu */
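+/* Local definition: not yet in our copy of linux/vfio.h */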
+#define VFIO_TYPE1_NESTING_IOMMU	6
+static int vfio_get_iommu_type(void)
+{
+	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_NESTING_IOMMU))
+		return VFIO_TYPE1_NESTING_IOMMU;
+
+	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
+		return VFIO_TYPE1v2_IOMMU;
+
+	if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
+		return VFIO_TYPE1_IOMMU;
+
+	return -ENODEV;
+}
+
+#define VFIO_PATH_MAX_LEN 16
+static int vfio_container_init(struct kvm *kvm)
+{
+	int api, i, ret, iommu_type;
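+	/* DMA-map all guest RAM so device IOVAs match guest physical addresses */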
+	struct vfio_iommu_type1_dma_map dma_map = {
+		.argsz	= sizeof(dma_map),
+		.flags	= VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+		.vaddr	= (unsigned long)kvm->ram_start,
+		.iova	= host_to_guest_flat(kvm, kvm->ram_start),
+		.size	= kvm->ram_size,
+	};
+
+	/* Create a container for our IOMMU groups */
+	vfio_container = open(VFIO_DEV_NODE, O_RDWR);
+	if (vfio_container == -1) {
+		ret = -errno;
+		pr_err("Failed to open %s", VFIO_DEV_NODE);
+		return ret;
+	}
+
+	api = ioctl(vfio_container, VFIO_GET_API_VERSION);
+	if (api != VFIO_API_VERSION) {
+		pr_err("Unknown VFIO API version %d", api);
+		return -ENODEV;
+	}
+
+	iommu_type = vfio_get_iommu_type();
+	if (iommu_type < 0) {
+		pr_err("VFIO type-1 IOMMU not supported on this platform");
+		return iommu_type;
+	}
+
+	/* Sanity check our groups and add them to the container */
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		char group_node[VFIO_PATH_MAX_LEN];
+		struct vfio_group *group = &kvm->cfg.vfio_group[i];
+		struct vfio_group_status group_status = {
+			.argsz = sizeof(group_status),
+		};
+
+		snprintf(group_node, VFIO_PATH_MAX_LEN, VFIO_DEV_DIR "/%lu",
+			 group->id);
+
+		group->fd = open(group_node, O_RDWR);
+		if (group->fd == -1) {
+			ret = -errno;
+			pr_err("Failed to open IOMMU group %s", group_node);
+			return ret;
+		}
+
+		if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+			ret = -errno;
+			pr_err("Failed to determine status of IOMMU group %s",
+				group_node);
+			return ret;
+		}
+
+		if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+			pr_err("IOMMU group %s is not viable", group_node);
+			return -EINVAL;
+		}
+
+		if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
+			ret = -errno;
+			pr_err("Failed to add IOMMU group %s to VFIO container",
+				group_node);
+			return ret;
+		}
+	}
+
+	/* Finalise the container */
+	if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
+		ret = -errno;
+		pr_err("Failed to set IOMMU type %d for VFIO container",
+			iommu_type);
+		return ret;
+	}
+	pr_info("Using IOMMU type %d for VFIO container", iommu_type);
+
+	/* Map the guest memory for DMA (i.e. provide isolation) */
+	if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+		ret = -errno;
+		pr_err("Failed to map guest memory for DMA");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int vfio__init(struct kvm *kvm)
+{
+	int ret;
+
+	if (!kvm->cfg.num_vfio_groups)
+		return 0;
+
+	ret = vfio_container_init(kvm);
+	if (ret)
+		return ret;
+
+	ret = vfio_configure_iommu_groups(kvm);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+dev_base_init(vfio__init);
+
+static int vfio__exit(struct kvm *kvm)
+{
+	int i, fd;
+
+	struct vfio_iommu_type1_dma_unmap dma_unmap = {
+		.argsz = sizeof(dma_unmap),
+		.size = kvm->ram_size,
+		.iova = host_to_guest_flat(kvm, kvm->ram_start),
+	};
+
+	if (!kvm->cfg.num_vfio_groups)
+		return 0;
+
+	for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+		fd = kvm->cfg.vfio_group[i].fd;
+		ioctl(fd, VFIO_GROUP_UNSET_CONTAINER, &vfio_container);
+		close(fd);
+	}
+
+	ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+	return close(vfio_container);
+}
+dev_base_exit(vfio__exit);