kvm tools: add support for device passthrough using VFIO
WIP: This is not suitable for (sane) human consumption yet!

To hand a PCI device to the guest, unbind it from its host driver and
bind it to vfio-pci. For a virtio-blk device (vendor 0x1af4, device
0x1001):

echo 0000:00:00.0 > /sys/bus/pci/devices/0000:00:00.0/driver/unbind
echo 1af4 1001 > /sys/bus/pci/drivers/vfio-pci/new_id
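
The group number to pass via --vfio-groups can be read from the
device's iommu_group link in sysfs:

readlink /sys/bus/pci/devices/0000:00:00.0/iommu_group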
Then pass --vfio-groups=1 to lkvm and watch the world die a painful
death.
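
For instance (kernel and disk image names are illustrative):

lkvm run -k bzImage -d rootfs.img --vfio-groups=1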
TODO:
- MSI-X interrupts and irqfd
Signed-off-by: Will Deacon <will.deacon@arm.com>
diff --git a/Makefile b/Makefile
index a2901b9..c48bfd9 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,7 @@
OBJS += mmio.o
OBJS += pci.o
OBJS += term.o
+OBJS += vfio.o
OBJS += virtio/blk.o
OBJS += virtio/scsi.o
OBJS += virtio/console.o
diff --git a/arm/pci.c b/arm/pci.c
index 813df26..4b9e9b9 100644
--- a/arm/pci.c
+++ b/arm/pci.c
@@ -1,5 +1,6 @@
#include "kvm/devices.h"
#include "kvm/fdt.h"
+#include "kvm/kvm.h"
#include "kvm/of_pci.h"
#include "kvm/pci.h"
#include "kvm/util.h"
diff --git a/builtin-run.c b/builtin-run.c
index 7f3872d..e165336 100644
--- a/builtin-run.c
+++ b/builtin-run.c
@@ -158,6 +158,11 @@
OPT_INTEGER('\0', "debug-iodelay", &(cfg)->debug_iodelay, \
"Delay IO by millisecond"), \
\
+ OPT_GROUP("VFIO options:"), \
+ OPT_CALLBACK('\0', "vfio-groups", NULL, "group number,...", \
+ "Pass through a list of VFIO groups to the " \
+ "virtual machine", vfio_group_parser, kvm), \
+ \
OPT_ARCH(RUN, cfg) \
OPT_END() \
};
diff --git a/include/kvm/kvm-config.h b/include/kvm/kvm-config.h
index 386fa8c..62dc6a2 100644
--- a/include/kvm/kvm-config.h
+++ b/include/kvm/kvm-config.h
@@ -2,6 +2,7 @@
#define KVM_CONFIG_H_
#include "kvm/disk-image.h"
+#include "kvm/vfio.h"
#include "kvm/kvm-config-arch.h"
#define DEFAULT_KVM_DEV "/dev/kvm"
@@ -20,9 +21,11 @@
struct kvm_config {
struct kvm_config_arch arch;
struct disk_image_params disk_image[MAX_DISK_IMAGES];
+ struct vfio_group vfio_group[MAX_VFIO_GROUPS];
u64 ram_size;
u8 image_count;
u8 num_net_devices;
+ u8 num_vfio_groups;
bool virtio_rng;
int active_console;
int debug_iodelay;
diff --git a/include/kvm/pci.h b/include/kvm/pci.h
index b0c28a1..2045ba2 100644
--- a/include/kvm/pci.h
+++ b/include/kvm/pci.h
@@ -7,7 +7,6 @@
#include <endian.h>
#include "kvm/devices.h"
-#include "kvm/kvm.h"
#include "kvm/msi.h"
/*
@@ -21,6 +20,8 @@
#define PCI_IO_SIZE 0x100
#define PCI_CFG_SIZE (1ULL << 24)
+struct kvm;
+
union pci_config_address {
struct {
#if __BYTE_ORDER == __LITTLE_ENDIAN
@@ -57,33 +58,55 @@
u32 pba_offset;
};
+#define PCI_BAR_OFFSET(b) (offsetof(struct pci_device_header, bar[b]))
+#define PCI_DEV_CFG_SIZE 256
+#define PCI_DEV_CFG_MASK (PCI_DEV_CFG_SIZE - 1)
+
+struct pci_device_header;
+
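+/*
+ * Optional per-device hooks, invoked on guest accesses to Configuration
+ * Space before the generic emulation in pci.c takes effect.
+ */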
+struct pci_config_operations {
+ void (*write)(struct pci_device_header *pci_hdr, u8 offset, void *data,
+ int sz);
+ void (*read)(struct pci_device_header *pci_hdr, u8 offset, void *data,
+ int sz);
+};
+
struct pci_device_header {
- u16 vendor_id;
- u16 device_id;
- u16 command;
- u16 status;
- u8 revision_id;
- u8 class[3];
- u8 cacheline_size;
- u8 latency_timer;
- u8 header_type;
- u8 bist;
- u32 bar[6];
- u32 card_bus;
- u16 subsys_vendor_id;
- u16 subsys_id;
- u32 exp_rom_bar;
- u8 capabilities;
- u8 reserved1[3];
- u32 reserved2;
- u8 irq_line;
- u8 irq_pin;
- u8 min_gnt;
- u8 max_lat;
- struct msix_cap msix;
- u8 empty[136]; /* Rest of PCI config space */
+ /* Configuration space, as seen by the guest */
+ union {
+ struct {
+ u16 vendor_id;
+ u16 device_id;
+ u16 command;
+ u16 status;
+ u8 revision_id;
+ u8 class[3];
+ u8 cacheline_size;
+ u8 latency_timer;
+ u8 header_type;
+ u8 bist;
+ u32 bar[6];
+ u32 card_bus;
+ u16 subsys_vendor_id;
+ u16 subsys_id;
+ u32 exp_rom_bar;
+ u8 capabilities;
+ u8 reserved1[3];
+ u32 reserved2;
+ u8 irq_line;
+ u8 irq_pin;
+ u8 min_gnt;
+ u8 max_lat;
+ struct msix_cap msix;
+ } __attribute__((packed));
+ /* Pad to PCI config space size */
+ u8 __pad[PCI_DEV_CFG_SIZE];
+ };
+
+ /* Private to lkvm */
u32 bar_size[6];
-} __attribute__((packed));
+ struct pci_config_operations cfg_ops;
+};
int pci__init(struct kvm *kvm);
int pci__exit(struct kvm *kvm);
diff --git a/include/kvm/vfio.h b/include/kvm/vfio.h
new file mode 100644
index 0000000..20cc7b6
--- /dev/null
+++ b/include/kvm/vfio.h
@@ -0,0 +1,43 @@
+#ifndef KVM__VFIO_H
+#define KVM__VFIO_H
+
+#include "kvm/parse-options.h"
+#include "kvm/pci.h"
+
+#include <linux/vfio.h>
+
+#define MAX_VFIO_GROUPS 4
+
+struct vfio_pci_region_info {
+ struct vfio_region_info info;
+ u32 guest_phys_addr;
+ void *host_addr;
+};
+
+struct vfio_pci_irq_info {
+ struct vfio_irq_info info;
+ int eventfd;
+ u8 legacy_line;
+};
+
+struct vfio_device {
+ struct list_head list;
+
+ struct pci_device_header pci_hdr;
+ struct device_header dev_hdr;
+
+ int fd;
+ struct vfio_device_info info;
+ struct vfio_pci_irq_info irq;
+ struct vfio_pci_region_info *regions;
+};
+
+struct vfio_group {
+ unsigned long id; /* iommu_group number in sysfs */
+ int fd;
+ struct list_head devices;
+};
+
+int vfio_group_parser(const struct option *opt, const char *arg, int unset);
+
+#endif /* KVM__VFIO_H */
diff --git a/pci.c b/pci.c
index 3a6696c..03b650b 100644
--- a/pci.c
+++ b/pci.c
@@ -8,8 +8,6 @@
#include <linux/err.h>
#include <assert.h>
-#define PCI_BAR_OFFSET(b) (offsetof(struct pci_device_header, bar[b]))
-
static u32 pci_config_address_bits;
/* This is within our PCI gap - in an unused area.
@@ -131,61 +129,51 @@
void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
{
- u8 dev_num;
+ u8 offset;
+ struct pci_device_header *pci_hdr;
+ void *base;
+ u8 bar, dev_num = addr.device_number;
- dev_num = addr.device_number;
+ if (!pci_device_exists(addr.bus_number, dev_num, 0))
+ return;
- if (pci_device_exists(0, dev_num, 0)) {
- unsigned long offset;
+ offset = addr.w & PCI_DEV_CFG_MASK;
+ base = pci_hdr = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+ bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32));
- offset = addr.w & 0xff;
- if (offset < sizeof(struct pci_device_header)) {
- void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
- struct pci_device_header *hdr = p;
- u8 bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32));
- u32 sz = cpu_to_le32(PCI_IO_SIZE);
+ if (pci_hdr->cfg_ops.write)
+ pci_hdr->cfg_ops.write(pci_hdr, offset, data, size);
- if (bar < 6 && hdr->bar_size[bar])
- sz = hdr->bar_size[bar];
-
- /*
- * If the kernel masks the BAR it would expect to find the
- * size of the BAR there next time it reads from it.
- * When the kernel got the size it would write the address
- * back.
- */
- if (*(u32 *)(p + offset)) {
- /* See if kernel tries to mask one of the BARs */
- if ((offset >= PCI_BAR_OFFSET(0)) &&
- (offset <= PCI_BAR_OFFSET(6)) &&
- (ioport__read32(data) == 0xFFFFFFFF))
- memcpy(p + offset, &sz, sizeof(sz));
- else
- memcpy(p + offset, data, size);
- }
- }
+ if (bar < 6 && (ioport__read32(data) == 0xFFFFFFFF)) {
+ /*
+ * If the kernel masks the BAR it would expect to
+ * find the size of the BAR there next time it reads
+ * from it. When the kernel got the size it would
+ * write the address back.
+ */
+ u32 sz = pci_hdr->bar_size[bar];
+ memcpy(base + offset, &sz, sizeof(sz));
+ } else if (*(u32 *)(base + offset)) {
+ memcpy(base + offset, data, size);
}
}
void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
{
- u8 dev_num;
+ u8 offset;
+ struct pci_device_header *pci_hdr;
+ u8 dev_num = addr.device_number;
- dev_num = addr.device_number;
-
- if (pci_device_exists(0, dev_num, 0)) {
- unsigned long offset;
-
- offset = addr.w & 0xff;
- if (offset < sizeof(struct pci_device_header)) {
- void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
-
- memcpy(data, p + offset, size);
- } else {
- memset(data, 0x00, size);
- }
- } else {
+ if (!pci_device_exists(addr.bus_number, dev_num, 0)) {
memset(data, 0xff, size);
+ } else {
+ pci_hdr = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+ offset = addr.w & PCI_DEV_CFG_MASK;
+
+ if (pci_hdr->cfg_ops.read)
+ pci_hdr->cfg_ops.read(pci_hdr, offset, data, size);
+
+ memcpy(data, (void *)pci_hdr + offset, size);
}
}
diff --git a/vfio.c b/vfio.c
new file mode 100644
index 0000000..9f1ae22
--- /dev/null
+++ b/vfio.c
@@ -0,0 +1,756 @@
+#include "kvm/vfio.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <linux/pci_regs.h>
+
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+
+#include <dirent.h>
+#include <pthread.h>
+
+#define VFIO_DEV_DIR "/dev/vfio"
+#define VFIO_DEV_NODE VFIO_DEV_DIR "/vfio"
+#define IOMMU_GROUP_DIR "/sys/kernel/iommu_groups"
+
+static int vfio_container;
+
+int vfio_group_parser(const struct option *opt, const char *arg, int unset)
+{
+ char *cur, *buf = strdup(arg);
+ int idx = 0;
+ struct kvm *kvm = opt->ptr;
+
+ cur = strtok(buf, ",");
+ while (cur && idx < MAX_VFIO_GROUPS) {
+ struct vfio_group *group = &kvm->cfg.vfio_group[idx++];
+
+ group->id = strtoul(cur, NULL, 0);
+ INIT_LIST_HEAD(&group->devices);
+ cur = strtok(NULL, ",");
+ }
+
+ if (cur)
+ pr_warning("Truncating VFIO group list to %d entries",
+ MAX_VFIO_GROUPS);
+
+ kvm->cfg.num_vfio_groups = idx;
+ free(buf);
+ return 0;
+}
+
+static void vfio_pci_cfg_read(struct pci_device_header *pci_hdr, u8 offset,
+ void *data, int sz)
+{
+ struct vfio_region_info *info;
+ struct vfio_device *device;
+ char dummy[sz];
+
+ device = container_of(pci_hdr, struct vfio_device, pci_hdr);
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+ /* Dummy read, in case the access has side-effects on the device */
+ if (pread(device->fd, dummy, sz, info->offset + offset) != sz)
+ pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
+ sz, offset);
+}
+
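+/*
+ * Write the access through to the device, then read the result back
+ * into our shadow copy so that subsequent guest reads see the value
+ * the device actually accepted.
+ */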
+static void vfio_pci_cfg_write(struct pci_device_header *pci_hdr, u8 offset,
+ void *data, int sz)
+{
+ struct vfio_region_info *info;
+ struct vfio_device *device;
+ void *base = pci_hdr;
+
+ device = container_of(pci_hdr, struct vfio_device, pci_hdr);
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+ if (pwrite(device->fd, data, sz, info->offset + offset) != sz)
+ pr_warning("Failed to write %d bytes to Configuration Space at 0x%x",
+ sz, offset);
+
+ if (pread(device->fd, base + offset, sz, info->offset + offset) != sz)
+ pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
+ sz, offset);
+}
+
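+/*
+ * Walk the standard capability list in Configuration Space, looking for
+ * an MSI-X capability to slurp into our shadow header.
+ */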
+static int vfio_pci_parse_msix_cap(struct vfio_device *device)
+{
+ u8 pos, caps[2];
+ struct vfio_region_info *info;
+ ssize_t sz = sizeof(caps);
+
+ if (!(device->pci_hdr.status & PCI_STATUS_CAP_LIST))
+ return -ENODEV;
+
+ pos = device->pci_hdr.capabilities & ~3;
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+
+ while (pos) {
+ if (pread(device->fd, caps, sz, info->offset + pos) != sz) {
+ pr_warning("Failed to read from capabilities pointer (0x%x)",
+ pos);
+ return -EINVAL;
+ }
+
+ if (caps[0] != PCI_CAP_ID_MSIX) {
+ pos = caps[1];
+ continue;
+ }
+
+ /* Slurp the MSI-X capability. */
+ sz = sizeof(device->pci_hdr.msix);
+ if (pread(device->fd, &device->pci_hdr.msix, sz,
+ info->offset + pos) != sz) {
+ pr_warning("Failed to read MSI-X capability structure");
+ device->pci_hdr.msix.cap = 0;
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ return -ENODEV;
+}
+
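+/*
+ * Snapshot the device's Configuration Space into our shadow header and
+ * check that we are dealing with an ordinary (type 0) function.
+ */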
+static int vfio_pci_parse_cfg_space(struct vfio_device *device)
+{
+ struct vfio_region_info *info;
+ ssize_t sz = PCI_DEV_CFG_SIZE;
+
+ if (device->info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
+ pr_err("Configuration Space not found");
+ return -ENODEV;
+ }
+
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+ *info = (struct vfio_region_info) {
+ .argsz = sizeof(*info),
+ .index = VFIO_PCI_CONFIG_REGION_INDEX,
+ };
+
+ ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, info);
+ if (!info->size) {
+ pr_err("Configuration Space has size zero?!");
+ return -EINVAL;
+ }
+
+ if (pread(device->fd, &device->pci_hdr, sz, info->offset) != sz) {
+ pr_err("Failed to read %zd bytes of Configuration Space", sz);
+ return -EIO;
+ }
+
+ /* Mask out the multi-function bit when checking the header type */
+ if ((device->pci_hdr.header_type & 0x7f) != PCI_HEADER_TYPE_NORMAL) {
+ pr_err("Unsupported header type %u",
+ device->pci_hdr.header_type);
+ return -EOPNOTSUPP;
+ }
+
+ if (vfio_pci_parse_msix_cap(device))
+ pr_warning("Failed to parse device MSI-X capability -- attempting INTx");
+
+ return 0;
+}
+
+static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
+{
+ int i;
+ struct vfio_region_info *info;
+ ssize_t sz = PCI_DEV_CFG_SIZE;
+
+ /* Enable exclusively MMIO and bus mastering */
+ device->pci_hdr.command &= ~PCI_COMMAND_IO;
+ device->pci_hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
+
+ /* Initialise the BARs */
+ for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+ struct vfio_pci_region_info *region = &device->regions[i];
+ u32 base = region->guest_phys_addr;
+
+ if (!base)
+ continue;
+
+ device->pci_hdr.bar_size[i] = region->info.size;
+
+ /* Construct a fake reg to match what we've mapped. */
+ device->pci_hdr.bar[i] = (base & PCI_BASE_ADDRESS_MEM_MASK) |
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_32;
+ }
+
+ /* I really can't be bothered to support cardbus. */
+ device->pci_hdr.card_bus = 0;
+
+ /*
+ * Nuke the expansion ROM for now. If we want to do this properly,
+ * we need to save its size somewhere and map into the guest.
+ */
+ device->pci_hdr.exp_rom_bar = 0;
+
+ /*
+  * FIXME: we don't support MSI-X yet, so nuke it. This makes the
+  * plumbing below dead code until we do.
+  */
+ device->pci_hdr.msix.cap = 0;
+
+ /* Plumb in our fake MSI-X capability, if we have it. */
+ if (device->pci_hdr.msix.cap) {
+ device->pci_hdr.capabilities =
+ (void *)&device->pci_hdr.msix - (void *)&device->pci_hdr;
+ device->pci_hdr.msix.next = 0;
+ } else {
+ device->pci_hdr.capabilities = 0;
+ }
+
+ /* Install our fake Configuration Space */
+ info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
+ if (pwrite(device->fd, &device->pci_hdr, sz, info->offset) != sz) {
+ pr_err("Failed to write %zd bytes to Configuration Space", sz);
+ return -EIO;
+ }
+
+ /* Register callbacks for cfg accesses */
+ device->pci_hdr.cfg_ops = (struct pci_config_operations) {
+ .read = vfio_pci_cfg_read,
+ .write = vfio_pci_cfg_write,
+ };
+
+ return 0;
+}
+
+static int vfio_pci_map_bar(struct kvm *kvm, int fd,
+ struct vfio_pci_region_info *region)
+{
+ void *base;
+ int ret, prot = 0;
+
+ /*
+ * We don't want to mess about trapping BAR accesses, so require
+ * that they can be mmap'd. Note that this precludes the use of
+ * I/O BARs in the guest (we will hide them from Configuration
+ * Space, which is trapped).
+ */
+ if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
+ pr_info("Ignoring BAR %u, as it can't be mmap'd",
+ region->info.index);
+ return 0;
+ }
+
+ if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
+ prot |= PROT_READ;
+ if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
+ prot |= PROT_WRITE;
+
+ base = mmap(NULL, region->info.size, prot, MAP_SHARED, fd,
+ region->info.offset);
+ if (base == MAP_FAILED) {
+ ret = -errno;
+ pr_err("Failed to mmap BAR region %u (0x%llx bytes)",
+ region->info.index, region->info.size);
+ return ret;
+ }
+ region->host_addr = base;
+
+ /* Grab some MMIO space in the guest */
+ region->guest_phys_addr = pci_get_io_space_block(region->info.size);
+
+ /* Register the BAR as a memory region with KVM */
+ ret = kvm__register_mem(kvm, region->guest_phys_addr, region->info.size,
+ region->host_addr);
+ if (ret) {
+ pr_err("Failed to register BAR as memory region with KVM");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vfio_pci_configure_dev_regions(struct kvm *kvm,
+ struct vfio_device *device)
+{
+ int ret;
+ u32 i, num_regions = device->info.num_regions;
+
+ ret = vfio_pci_parse_cfg_space(device);
+ if (ret)
+ return ret;
+
+ /* First of all, map the BARs directly into the guest */
+ for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
+ struct msix_cap *msix = &device->pci_hdr.msix;
+ struct vfio_pci_region_info *region;
+
+ if (i >= num_regions)
+ break;
+
+ region = &device->regions[i];
+ region->info = (struct vfio_region_info) {
+ .argsz = sizeof(region->info),
+ .index = i,
+ };
+
+ ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, &region->info);
+ /* Ignore invalid or unimplemented regions */
+ if (!region->info.size)
+ continue;
+
+ /* Avoid trying to map MSI-X BARs */
+ if (msix->cap) {
+ if ((msix->table_offset & PCI_MSIX_TABLE_BIR) == i)
+ continue;
+ if ((msix->pba_offset & PCI_MSIX_PBA_BIR) == i)
+ continue;
+ }
+
+ /*
+ * Map the BARs into the guest. We'll later need to update
+ * configuration space to reflect our allocation.
+ */
+ ret = vfio_pci_map_bar(kvm, device->fd, region);
+ if (ret)
+ return ret;
+ }
+
+ /* We've configured the BARs, fake up a Configuration Space */
+ return vfio_pci_fixup_cfg_space(device);
+}
+
+static int vfio_configure_dev_regions(struct kvm *kvm,
+ struct vfio_device *device)
+{
+ u32 num_regions = device->info.num_regions;
+
+ /* We only support vfio-pci devices for the moment */
+ if (!(device->info.flags & VFIO_DEVICE_FLAGS_PCI)) {
+ pr_warning("Only vfio-pci devices are supported. "
+ "Ignoring device regions.");
+ device->info.num_regions = 0;
+ return 0;
+ }
+
+ device->regions = calloc(num_regions, sizeof(*device->regions));
+ if (!device->regions) {
+ pr_err("Failed to allocate %u regions for device",
+ num_regions);
+ return -ENOMEM;
+ }
+
+ return vfio_pci_configure_dev_regions(kvm, device);
+}
+
+/*
+ * FIXME: This should use KVM_IRQFD to avoid the round-trip to userspace,
+ * but that relies on CONFIG_HAVE_KVM_IRQ_ROUTING in the host
+ * (i.e. KVM_CAP_IRQ_ROUTING). Eric Auger (ST/Linaro) is working
+ * on this. Until then, make use of this horrible kludge.
+ */
+
+static int epoll_fd = -1;
+static pthread_t intx_thread;
+
+/* Alleeexxxx! */
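+/*
+ * struct vfio_irq_set ends in a variable-length data[] payload; this
+ * wrapper makes room for a single eventfd so that argsz covers both.
+ */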
+struct non_braindead_vfio_irq_set {
+ struct vfio_irq_set irq;
+ int fd;
+};
+
+static void *vfio_pci_intx__thread(void *param)
+{
+ struct epoll_event event;
+ struct kvm *kvm = param;
+ struct non_braindead_vfio_irq_set irq = {
+ .irq = {
+ .argsz = sizeof(irq),
+ .flags = VFIO_IRQ_SET_DATA_NONE |
+ VFIO_IRQ_SET_ACTION_UNMASK,
+ .index = VFIO_PCI_INTX_IRQ_INDEX,
+ .start = 0,
+ .count = 1,
+ },
+ };
+
+ kvm__set_thread_name("vfio-pci-intx");
+
+ for (;;) {
+ u64 tmp;
+ int nfds;
+ struct vfio_device *device;
+
+ nfds = epoll_wait(epoll_fd, &event, 1, -1);
+ if (nfds <= 0)
+ continue;
+
+ device = event.data.ptr;
+ if (read(device->irq.eventfd, &tmp, sizeof(tmp)) < 0)
+ pr_warning("Failed to read VFIO INTx event");
+
+ kvm__irq_trigger(kvm, device->irq.legacy_line);
+
+ /*
+ * We can only unmask the interrupt straight away, since
+ * there isn't a reliable way to know when the guest has
+ * de-asserted the line on the device. Unfortunately, if
+ * the guest is busy doing something else (like handling
+ * another interrupt), then we'll trigger the spurious
+ * IRQ detector in the host and the physical IRQ will be
+ * masked. Worse still, we can't ask KVM about the status
+ * of the virtual interrupt line, so all we can do is
+ * sleep for 1ms and hope for the best. IRQFD will solve
+ * this for us.
+ */
+ usleep(1000);
+ irq.fd = device->irq.eventfd;
+ if (ioctl(device->fd, VFIO_DEVICE_SET_IRQS, &irq.irq) < 0)
+ pr_warning("Failed to UNMASK IRQ in INTx loop");
+ }
+
+ return NULL;
+}
+
+static int vfio_pci_init_intx_eventfd(struct kvm *kvm,
+ struct vfio_device *device)
+{
+ int fd, ret;
+ struct non_braindead_vfio_irq_set irq;
+ struct epoll_event ev = { 0 };
+
+ /* Initialise the epoll fd and worker thread. */
+ if (epoll_fd < 0) {
+ epoll_fd = epoll_create1(0);
+ if (epoll_fd < 0) {
+ ret = -errno;
+ pr_err("Failed to create epoll descriptor for INTx thread");
+ return ret;
+ }
+
+ ret = pthread_create(&intx_thread, NULL, vfio_pci_intx__thread,
+ kvm);
+ if (ret) {
+ pr_err("Failed to start INTx thread");
+ return -ret;
+ }
+ }
+
+ /*
+ * Create an eventfd for our physical interrupt and add that to
+ * the epoll fd.
+ */
+ fd = eventfd(0, 0);
+ if (fd < 0) {
+ ret = -errno;
+ pr_err("Failed to create eventfd");
+ return ret;
+ }
+
+ ev.events = EPOLLIN;
+ ev.data.ptr = device;
+ device->irq.eventfd = fd;
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+ ret = -errno;
+ pr_err("Failed to add eventfd to epoll descriptor");
+ return ret;
+ }
+
+ /* Plumb the eventfd into the irq. */
+ irq.irq = (struct vfio_irq_set) {
+ .argsz = sizeof(irq),
+ .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = VFIO_PCI_INTX_IRQ_INDEX,
+ .start = 0,
+ .count = 1,
+ };
+ irq.fd = fd;
+
+ ret = ioctl(device->fd, VFIO_DEVICE_SET_IRQS, &irq.irq);
+ if (ret < 0) {
+ pr_err("Failed to setup VFIO IRQs");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device)
+{
+ int ret;
+
+ device->irq.info = (struct vfio_irq_info) {
+ .argsz = sizeof(device->irq.info)
+ };
+
+ if (device->pci_hdr.msix.cap) {
+ /* TODO: set up shadow PBA/table structures for MSI-X. */
+ } else {
+ /* We don't have MSI-X, so fall back on INTx */
+ pr_info("MSI-X not available for device 0x%x, falling back to INTx",
+ device->dev_hdr.dev_num);
+ device->irq.legacy_line = device->pci_hdr.irq_line;
+ device->irq.info.index = VFIO_PCI_INTX_IRQ_INDEX;
+ ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq.info);
+
+ if (device->irq.info.count != 1) {
+ pr_err("No INTx interrupts found");
+ return -ENODEV;
+ }
+
+ if (!(device->irq.info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+ pr_err("INTx interrupt not EVENTFD capable");
+ return -EINVAL;
+ }
+
+ if (!(device->irq.info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
+ pr_err("INTx interrupt not AUTOMASKED");
+ return -EINVAL;
+ }
+
+ ret = vfio_pci_init_intx_eventfd(kvm, device);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
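+/*
+ * Walk each group's sysfs devices directory, obtain a device fd from
+ * VFIO for every entry and set up its regions and interrupts.
+ */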
+static int vfio_configure_iommu_groups(struct kvm *kvm)
+{
+ int i, ret;
+
+ for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+ DIR *dir;
+ struct dirent *dirent;
+ char dirpath[PATH_MAX];
+ struct vfio_group *group = &kvm->cfg.vfio_group[i];
+
+ snprintf(dirpath, PATH_MAX, IOMMU_GROUP_DIR "/%lu/devices",
+ group->id);
+
+ dir = opendir(dirpath);
+ if (!dir) {
+ ret = -errno;
+ pr_err("Failed to open IOMMU group %s", dirpath);
+ return ret;
+ }
+
+ while ((dirent = readdir(dir))) {
+ struct vfio_device *device;
+
+ if (dirent->d_type != DT_LNK)
+ continue;
+
+ device = calloc(1, sizeof(*device));
+ if (!device) {
+ pr_err("Failed to allocate VFIO device");
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&device->list);
+ device->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD,
+ dirent->d_name);
+ if (device->fd < 0) {
+ ret = -errno;
+ pr_err("Failed to get FD for device %s in group %lu",
+ dirent->d_name, group->id);
+ free(device);
+ /* The device might be a bridge without an fd */
+ continue;
+ }
+
+ if (ioctl(device->fd, VFIO_DEVICE_RESET) < 0)
+ pr_warning("Failed to reset device %s in group %lu",
+ dirent->d_name, group->id);
+
+ device->info.argsz = sizeof(device->info);
+ if (ioctl(device->fd, VFIO_DEVICE_GET_INFO, &device->info)) {
+ ret = -errno;
+ pr_err("Failed to get info for device %s in group %lu",
+ dirent->d_name, group->id);
+ return ret;
+ }
+
+ ret = vfio_configure_dev_regions(kvm, device);
+ if (ret) {
+ pr_err("Failed to configure regions for device %s in group %lu",
+ dirent->d_name, group->id);
+ return ret;
+ }
+
+ device->dev_hdr = (struct device_header) {
+ .bus_type = DEVICE_BUS_PCI,
+ .data = &device->pci_hdr,
+ };
+
+ ret = device__register(&device->dev_hdr);
+ if (ret) {
+ pr_err("Failed to register VFIO device");
+ return ret;
+ }
+
+ ret = vfio_configure_dev_irqs(kvm, device);
+ if (ret) {
+ pr_err("Failed to configure IRQs for device %s in group%lu",
+ dirent->d_name, group->id);
+ return ret;
+ }
+
+ pr_info("Assigned device %s in group %lu to device number 0x%x",
+ dirent->d_name, group->id, device->dev_hdr.dev_num);
+
+ list_add(&device->list, &group->devices);
+ }
+
+ if (closedir(dir))
+ pr_warning("Failed to close IOMMU group %s", dirpath);
+ }
+
+ return 0;
+}
+
+/* TODO: this should be an arch callback, so arm can return HYP only if vsmmu */
+#define VFIO_TYPE1_NESTING_IOMMU 6
+static int vfio_get_iommu_type(void)
+{
+ if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_NESTING_IOMMU))
+ return VFIO_TYPE1_NESTING_IOMMU;
+
+ if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
+ return VFIO_TYPE1v2_IOMMU;
+
+ if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
+ return VFIO_TYPE1_IOMMU;
+
+ return -ENODEV;
+}
+
+#define VFIO_PATH_MAX_LEN 16
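+/*
+ * Open /dev/vfio/vfio, attach each group to the container, pick an
+ * IOMMU type and map the whole of guest RAM for DMA.
+ */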
+static int vfio_container_init(struct kvm *kvm)
+{
+ int api, i, ret, iommu_type;
+ struct vfio_iommu_type1_dma_map dma_map = {
+ .argsz = sizeof(dma_map),
+ .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+ .vaddr = (unsigned long)kvm->ram_start,
+ .iova = host_to_guest_flat(kvm, kvm->ram_start),
+ .size = kvm->ram_size,
+ };
+
+ /* Create a container for our IOMMU groups */
+ vfio_container = open(VFIO_DEV_NODE, O_RDWR);
+ if (vfio_container == -1) {
+ ret = -errno;
+ pr_err("Failed to open %s", VFIO_DEV_NODE);
+ return ret;
+ }
+
+ api = ioctl(vfio_container, VFIO_GET_API_VERSION);
+ if (api != VFIO_API_VERSION) {
+ pr_err("Unknown VFIO API version %d", api);
+ return -ENODEV;
+ }
+
+ iommu_type = vfio_get_iommu_type();
+ if (iommu_type < 0) {
+ pr_err("VFIO type-1 IOMMU not supported on this platform");
+ return iommu_type;
+ }
+
+ /* Sanity check our groups and add them to the container */
+ for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+ char group_node[VFIO_PATH_MAX_LEN];
+ struct vfio_group *group = &kvm->cfg.vfio_group[i];
+ struct vfio_group_status group_status = {
+ .argsz = sizeof(group_status),
+ };
+
+ snprintf(group_node, VFIO_PATH_MAX_LEN, VFIO_DEV_DIR "/%lu",
+ group->id);
+
+ group->fd = open(group_node, O_RDWR);
+ if (group->fd == -1) {
+ ret = -errno;
+ pr_err("Failed to open IOMMU group %s", group_node);
+ return ret;
+ }
+
+ if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+ ret = -errno;
+ pr_err("Failed to determine status of IOMMU group %s",
+ group_node);
+ return ret;
+ }
+
+ if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+ pr_err("IOMMU group %s is not viable", group_node);
+ return -EINVAL;
+ }
+
+ if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
+ ret = -errno;
+ pr_err("Failed to add IOMMU group %s to VFIO container",
+ group_node);
+ return ret;
+ }
+ }
+
+ /* Finalise the container */
+ if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
+ ret = -errno;
+ pr_err("Failed to set IOMMU type %d for VFIO container",
+ iommu_type);
+ return ret;
+ } else {
+ pr_info("Using IOMMU type %d for VFIO container",
+ iommu_type);
+ }
+
+ /* Map the guest memory for DMA (i.e. provide isolation) */
+ if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+ ret = -errno;
+ pr_err("Failed to map guest memory for DMA");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vfio__init(struct kvm *kvm)
+{
+ int ret;
+
+ if (!kvm->cfg.num_vfio_groups)
+ return 0;
+
+ ret = vfio_container_init(kvm);
+ if (ret)
+ return ret;
+
+ ret = vfio_configure_iommu_groups(kvm);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+dev_base_init(vfio__init);
+
+static int vfio__exit(struct kvm *kvm)
+{
+ int i, fd;
+
+ struct vfio_iommu_type1_dma_unmap dma_unmap = {
+ .argsz = sizeof(dma_unmap),
+ .size = kvm->ram_size,
+ .iova = host_to_guest_flat(kvm, kvm->ram_start),
+ };
+
+ if (!kvm->cfg.num_vfio_groups)
+ return 0;
+
+ for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
+ fd = kvm->cfg.vfio_group[i].fd;
+ ioctl(fd, VFIO_GROUP_UNSET_CONTAINER, &vfio_container);
+ close(fd);
+ }
+
+ ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+ return close(vfio_container);
+}
+dev_base_exit(vfio__exit);