#include "kvm/vfio.h"
#include "kvm/kvm.h"
#include "kvm/pci.h"
#include "kvm/util.h"
#include <linux/list.h>
#include <linux/kvm.h>
#include <linux/pci_regs.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <dirent.h>
#include <pthread.h>
#define VFIO_DEV_DIR "/dev/vfio"
#define VFIO_DEV_NODE VFIO_DEV_DIR "/vfio"
#define IOMMU_GROUP_DIR "/sys/kernel/iommu_groups"
static int vfio_container;
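
/*
 * Parse a comma-separated list of IOMMU group numbers from the command
 * line into kvm->cfg.vfio_group[], truncating at MAX_VFIO_GROUPS.
 */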
int vfio_group_parser(const struct option *opt, const char *arg, int unset)
{
	char *cur, *buf = strdup(arg);
	int idx = 0;
	struct kvm *kvm = opt->ptr;

	if (!buf)
		return -ENOMEM;

	cur = strtok(buf, ",");
while (cur && idx < MAX_VFIO_GROUPS) {
struct vfio_group *group = &kvm->cfg.vfio_group[idx++];
group->id = strtoul(cur, NULL, 0);
INIT_LIST_HEAD(&group->devices);
cur = strtok(NULL, ",");
}
if (cur)
pr_warning("Truncating VFIO group list to %d entries",
MAX_VFIO_GROUPS);
kvm->cfg.num_vfio_groups = idx;
free(buf);
return 0;
}
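
/*
 * Callback for guest reads of Configuration Space. The value returned to
 * the guest comes from our shadow pci_hdr; we only touch the physical
 * device in case the read has side effects.
 */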
static void vfio_pci_cfg_read(struct pci_device_header *pci_hdr, u8 offset,
void *data, int sz)
{
struct vfio_region_info *info;
struct vfio_device *device;
char base[sz];
device = container_of(pci_hdr, struct vfio_device, pci_hdr);
info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
/* Dummy read in case of side-effects */
if (pread(device->fd, base, sz, info->offset + offset) != sz)
pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
sz, offset);
}
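
/*
 * Callback for guest writes to Configuration Space. Forward the write to
 * the physical device, then read the result back into the shadow pci_hdr
 * so that subsequent reads observe what the device actually accepted.
 */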
static void vfio_pci_cfg_write(struct pci_device_header *pci_hdr, u8 offset,
void *data, int sz)
{
struct vfio_region_info *info;
struct vfio_device *device;
void *base = pci_hdr;
device = container_of(pci_hdr, struct vfio_device, pci_hdr);
info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
if (pwrite(device->fd, data, sz, info->offset + offset) != sz)
pr_warning("Failed to write %d bytes to Configuration Space at 0x%x",
sz, offset);
if (pread(device->fd, base + offset, sz, info->offset + offset) != sz)
pr_warning("Failed to read %d bytes from Configuration Space at 0x%x",
sz, offset);
}
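
/*
 * Walk the capability list in Configuration Space looking for MSI-X and,
 * if found, cache the capability structure in the shadow pci_hdr.
 */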
static int vfio_pci_parse_msix_cap(struct vfio_device *device)
{
u8 pos, caps[2];
struct vfio_region_info *info;
ssize_t sz = sizeof(caps);
if (!(device->pci_hdr.status & PCI_STATUS_CAP_LIST))
return -ENODEV;
pos = device->pci_hdr.capabilities & ~3;
info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
while (pos) {
if (pread(device->fd, caps, sz, info->offset + pos) != sz) {
pr_warning("Failed to read from capabilities pointer (0x%x)",
pos);
return -EINVAL;
}
if (caps[0] != PCI_CAP_ID_MSIX) {
pos = caps[1];
continue;
}
/* Slurp the MSI-X capability. */
sz = sizeof(device->pci_hdr.msix);
if (pread(device->fd, &device->pci_hdr.msix, sz,
info->offset + pos) != sz) {
pr_warning("Failed to read MSI-X capability structure");
device->pci_hdr.msix.cap = 0;
return -EINVAL;
}
return 0;
}
return -ENODEV;
}
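
/*
 * Read the device's Configuration Space region into the shadow pci_hdr.
 * Only type 0 (normal) header layouts are supported.
 */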
static int vfio_pci_parse_cfg_space(struct vfio_device *device)
{
struct vfio_region_info *info;
ssize_t sz = PCI_DEV_CFG_SIZE;
	if (device->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
pr_err("Configuration Space not found");
return -ENODEV;
}
info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
*info = (struct vfio_region_info) {
.argsz = sizeof(*info),
.index = VFIO_PCI_CONFIG_REGION_INDEX,
};
ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, info);
if (!info->size) {
pr_err("Configuration Space has size zero?!");
return -EINVAL;
}
if (pread(device->fd, &device->pci_hdr, sz, info->offset) != sz) {
pr_err("Failed to read %zd bytes of Configuration Space", sz);
return -EIO;
}
	/* Ignore the multi-function bit when checking the header type */
	if ((device->pci_hdr.header_type & 0x7f) != PCI_HEADER_TYPE_NORMAL) {
		pr_err("Unsupported header type %u",
		       device->pci_hdr.header_type);
return -EOPNOTSUPP;
}
if (vfio_pci_parse_msix_cap(device))
pr_warning("Failed to parse device MSI-X capability -- attempting INTx");
return 0;
}
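
/*
 * Rewrite the shadow Configuration Space to describe what we expose to
 * the guest: 32-bit MMIO BARs at our allocated guest-physical addresses,
 * no expansion ROM, no CardBus and (for now) no MSI-X capability.
 */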
static int vfio_pci_fixup_cfg_space(struct vfio_device *device)
{
int i;
struct vfio_region_info *info;
ssize_t sz = PCI_DEV_CFG_SIZE;
/* Enable exclusively MMIO and bus mastering */
device->pci_hdr.command &= ~PCI_COMMAND_IO;
device->pci_hdr.command |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
/* Initialise the BARs */
for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
struct vfio_pci_region_info *region = &device->regions[i];
u32 base = region->guest_phys_addr;
if (!base)
continue;
device->pci_hdr.bar_size[i] = region->info.size;
/* Construct a fake reg to match what we've mapped. */
device->pci_hdr.bar[i] = (base & PCI_BASE_ADDRESS_MEM_MASK) |
PCI_BASE_ADDRESS_SPACE_MEMORY |
PCI_BASE_ADDRESS_MEM_TYPE_32;
}
/* I really can't be bothered to support cardbus. */
device->pci_hdr.card_bus = 0;
/*
* Nuke the expansion ROM for now. If we want to do this properly,
* we need to save its size somewhere and map into the guest.
*/
device->pci_hdr.exp_rom_bar = 0;
	/*
	 * FIXME: we don't support MSI-X yet, so nuke the capability. Note
	 * that this leaves the plumbing below as dead code until then.
	 */
	device->pci_hdr.msix.cap = 0;
/* Plumb in our fake MSI-X capability, if we have it. */
if (device->pci_hdr.msix.cap) {
device->pci_hdr.capabilities =
(void *)&device->pci_hdr.msix - (void *)&device->pci_hdr;
device->pci_hdr.msix.next = 0;
} else {
device->pci_hdr.capabilities = 0;
}
/* Install our fake Configuration Space */
info = &device->regions[VFIO_PCI_CONFIG_REGION_INDEX].info;
if (pwrite(device->fd, &device->pci_hdr, sz, info->offset) != sz) {
pr_err("Failed to write %zd bytes to Configuration Space", sz);
return -EIO;
}
/* Register callbacks for cfg accesses */
device->pci_hdr.cfg_ops = (struct pci_config_operations) {
.read = vfio_pci_cfg_read,
.write = vfio_pci_cfg_write,
};
return 0;
}
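
/*
 * mmap() a BAR from the host and install the mapping into the guest
 * physical address space, so that guest accesses reach the device
 * without trapping back to us.
 */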
static int vfio_pci_map_bar(struct kvm *kvm, int fd,
struct vfio_pci_region_info *region)
{
void *base;
int ret, prot = 0;
u64 map_size = ALIGN(region->info.size, PAGE_SIZE);
/*
* We don't want to mess about trapping BAR accesses, so require
* that they can be mmap'd. Note that this precludes the use of
* I/O BARs in the guest (we will hide them from Configuration
* Space, which is trapped).
*/
if (!(region->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) {
pr_info("Ignoring BAR %u, as it can't be mmap'd",
region->info.index);
return 0;
}
if (region->info.flags & VFIO_REGION_INFO_FLAG_READ)
prot |= PROT_READ;
if (region->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
prot |= PROT_WRITE;
base = mmap(NULL, region->info.size, prot, MAP_SHARED, fd,
region->info.offset);
if (base == MAP_FAILED) {
ret = -errno;
pr_err("Failed to mmap BAR region %u (0x%llx bytes)",
region->info.index, region->info.size);
return ret;
}
region->host_addr = base;
/* Grab some MMIO space in the guest */
region->guest_phys_addr = pci_get_io_space_block(map_size);
/* Register the BAR as a memory region with KVM */
ret = kvm__register_mem(kvm, region->guest_phys_addr, map_size,
region->host_addr);
if (ret) {
pr_err("Failed to register BAR as memory region with KVM");
return ret;
}
return 0;
}
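
/*
 * Query each BAR region of a vfio-pci device, map the mappable ones into
 * the guest and then fake up a matching Configuration Space.
 */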
static int vfio_pci_configure_dev_regions(struct kvm *kvm,
struct vfio_device *device)
{
int ret;
u32 i, num_regions = device->info.num_regions;
ret = vfio_pci_parse_cfg_space(device);
if (ret)
return ret;
/* First of all, map the BARs directly into the guest */
for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) {
struct msix_cap *msix = &device->pci_hdr.msix;
struct vfio_pci_region_info *region;
		if (i >= num_regions)
			break;
region = &device->regions[i];
region->info = (struct vfio_region_info) {
			.argsz = sizeof(region->info),
.index = i,
};
ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, &region->info);
/* Ignore invalid or unimplemented regions */
if (!region->info.size)
continue;
/* Avoid trying to map MSI-X BARs */
if (msix->cap) {
if ((msix->table_offset & PCI_MSIX_TABLE_BIR) == i)
continue;
if ((msix->pba_offset & PCI_MSIX_PBA_BIR) == i)
continue;
}
/*
* Map the BARs into the guest. We'll later need to update
* configuration space to reflect our allocation.
*/
ret = vfio_pci_map_bar(kvm, device->fd, region);
if (ret)
return ret;
}
/* We've configured the BARs, fake up a Configuration Space */
return vfio_pci_fixup_cfg_space(device);
}
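
/*
 * Allocate the per-device region array and hand off to the PCI-specific
 * setup. Only vfio-pci devices are handled for now.
 */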
static int vfio_configure_dev_regions(struct kvm *kvm,
struct vfio_device *device)
{
u32 num_regions = device->info.num_regions;
/* We only support vfio-pci devices for the moment */
if (!(device->info.flags & VFIO_DEVICE_FLAGS_PCI)) {
pr_warning("Only vfio-pci devices are supported. "
"Ignoring device regions.");
device->info.num_regions = 0;
return 0;
}
device->regions = calloc(num_regions, sizeof(*device->regions));
if (!device->regions) {
pr_err("Failed to allocate %u regions for device",
num_regions);
return -ENOMEM;
}
return vfio_pci_configure_dev_regions(kvm, device);
}
/*
* FIXME: This should use KVM_IRQFD to avoid the round-trip to userspace,
* but that relies on CONFIG_HAVE_KVM_IRQ_ROUTING in the host
* (i.e. KVM_CAP_IRQ_ROUTING). Eric Auger (ST/Linaro) is working
* on this. Until then, make use of this horrible kludge.
*/
static int epoll_fd = -1;
static pthread_t intx_thread;
/* Alleeexxxx! */
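/*
 * struct vfio_irq_set ends in a flexible array member; appending the
 * eventfd gives us one contiguous object (header plus data) to pass to
 * VFIO_DEVICE_SET_IRQS.
 */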
struct non_braindead_vfio_irq_set {
struct vfio_irq_set irq;
int fd;
};
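
/*
 * INTx worker: wait for a device's interrupt eventfd to fire, inject the
 * corresponding legacy IRQ into the guest, then unmask the automasked
 * physical interrupt.
 */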
static void *vfio_pci_intx__thread(void *param)
{
struct epoll_event event;
struct kvm *kvm = param;
struct non_braindead_vfio_irq_set irq = {
.irq = {
.argsz = sizeof(irq),
.flags = VFIO_IRQ_SET_DATA_NONE |
VFIO_IRQ_SET_ACTION_UNMASK,
.index = VFIO_PCI_INTX_IRQ_INDEX,
.start = 0,
.count = 1,
},
};
kvm__set_thread_name("vfio-pci-intx");
for (;;) {
u64 tmp;
int nfds;
struct vfio_device *device;
nfds = epoll_wait(epoll_fd, &event, 1, -1);
if (nfds <= 0)
continue;
device = event.data.ptr;
if (read(device->irq.eventfd, &tmp, sizeof(tmp)) < 0)
pr_warning("Failed to read VFIO INTx event");
kvm__irq_trigger(kvm, device->irq.legacy_line);
/*
* We can only unmask the interrupt straight away, since
* there isn't a reliable way to know when the guest has
* de-asserted the line on the device. Unfortunately, if
* the guest is busy doing something else (like handling
* another interrupt), then we'll trigger the spurious
* IRQ detector in the host and the physical IRQ will be
* masked. Worse still, we can't ask KVM about the status
* of the virtual interrupt line, so all we can do is
* sleep for 1ms and hope for the best. IRQFD will solve
* this for us.
*/
usleep(1000);
irq.fd = device->irq.eventfd;
if (ioctl(device->fd, VFIO_DEVICE_SET_IRQS, &irq.irq) < 0)
pr_warning("Failed to UNMASK IRQ in INTx loop");
}
return NULL;
}
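
/*
 * Create an eventfd for the device's INTx line, register it with the
 * worker thread's epoll instance and wire it up as the VFIO interrupt
 * trigger.
 */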
static int vfio_pci_init_intx_eventfd(struct kvm *kvm,
struct vfio_device *device)
{
int fd, ret;
struct non_braindead_vfio_irq_set irq;
struct epoll_event ev = { 0 };
/* Initialise the epoll fd and worker thread. */
if (epoll_fd < 0) {
epoll_fd = epoll_create1(0);
if (epoll_fd < 0) {
ret = -errno;
pr_err("Failed to create epoll descriptor for INTx thread");
return ret;
}
ret = pthread_create(&intx_thread, NULL, vfio_pci_intx__thread,
kvm);
if (ret) {
pr_err("Failed to start INTx thread");
return -ret;
}
}
/*
* Create an eventfd for our physical interrupt and add that to
* the epoll fd.
*/
fd = eventfd(0, 0);
	if (fd < 0) {
		ret = -errno;
		pr_err("Failed to create eventfd");
		return ret;
	}
ev.events = EPOLLIN;
ev.data.ptr = device;
device->irq.eventfd = fd;
	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0) {
		ret = -errno;
		pr_err("Failed to add eventfd to epoll descriptor");
		close(fd);
		return ret;
	}
/* Plumb the eventfd into the irq. */
irq.irq = (struct vfio_irq_set) {
.argsz = sizeof(irq),
.flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
.index = VFIO_PCI_INTX_IRQ_INDEX,
.start = 0,
.count = 1,
};
irq.fd = fd;
ret = ioctl(device->fd, VFIO_DEVICE_SET_IRQS, &irq.irq);
if (ret < 0) {
pr_err("Failed to setup VFIO IRQs");
return ret;
}
return 0;
}
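
/*
 * Configure device interrupts. With MSI-X support still a TODO, this
 * always falls back to the (eventfd-capable, automasked) INTx line.
 */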
static int vfio_configure_dev_irqs(struct kvm *kvm, struct vfio_device *device)
{
int ret;
device->irq.info = (struct vfio_irq_info) {
.argsz = sizeof(device->irq.info)
};
if (device->pci_hdr.msix.cap) {
/* TODO: set up shadow PBA/table structures for MSI-X. */
} else {
/* We don't have MSI-X, so fall back on INTx */
pr_info("MSI-X not available for device 0x%x, falling back to INTx",
device->dev_hdr.dev_num);
device->irq.legacy_line = device->pci_hdr.irq_line;
device->irq.info.index = VFIO_PCI_INTX_IRQ_INDEX;
		ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &device->irq.info);
if (device->irq.info.count != 1) {
pr_err("No INTx interrupts found");
return -ENODEV;
}
if (!(device->irq.info.flags & VFIO_IRQ_INFO_EVENTFD)) {
pr_err("INTx interrupt not EVENTFD capable");
return -EINVAL;
}
if (!(device->irq.info.flags & VFIO_IRQ_INFO_AUTOMASKED)) {
pr_err("INTx interrupt not AUTOMASKED");
return -EINVAL;
}
ret = vfio_pci_init_intx_eventfd(kvm, device);
if (ret)
return ret;
}
return 0;
}
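
/*
 * For each requested IOMMU group, walk its devices in sysfs, fetch a
 * VFIO fd for each one and configure its regions and interrupts.
 */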
static int vfio_configure_iommu_groups(struct kvm *kvm)
{
int i, ret;
for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
DIR *dir;
struct dirent *dirent;
char dirpath[PATH_MAX];
struct vfio_group *group = &kvm->cfg.vfio_group[i];
snprintf(dirpath, PATH_MAX, IOMMU_GROUP_DIR "/%lu/devices",
group->id);
dir = opendir(dirpath);
if (!dir) {
ret = -errno;
pr_err("Failed to open IOMMU group %s", dirpath);
return ret;
}
while ((dirent = readdir(dir))) {
struct vfio_device *device;
if (dirent->d_type != DT_LNK)
continue;
device = calloc(1, sizeof(*device));
			if (!device) {
				pr_err("Failed to allocate VFIO device");
				closedir(dir);
				return -ENOMEM;
			}
INIT_LIST_HEAD(&device->list);
device->fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD,
dirent->d_name);
			if (device->fd < 0) {
				pr_err("Failed to get FD for device %s in group %lu",
				       dirent->d_name, group->id);
				free(device);
				/* The device might be a bridge without an fd */
				continue;
}
if (ioctl(device->fd, VFIO_DEVICE_RESET) < 0)
pr_warning("Failed to reset device %s in group %lu",
dirent->d_name, group->id);
			device->info.argsz = sizeof(device->info);
if (ioctl(device->fd, VFIO_DEVICE_GET_INFO, &device->info)) {
ret = -errno;
pr_err("Failed to get info for device %s in group %lu",
dirent->d_name, group->id);
return ret;
}
ret = vfio_configure_dev_regions(kvm, device);
if (ret) {
pr_err("Failed to configure regions for device %s in group %lu",
dirent->d_name, group->id);
return ret;
}
device->dev_hdr = (struct device_header) {
.bus_type = DEVICE_BUS_PCI,
.data = &device->pci_hdr,
};
ret = device__register(&device->dev_hdr);
if (ret) {
pr_err("Failed to register VFIO device");
return ret;
}
ret = vfio_configure_dev_irqs(kvm, device);
if (ret) {
pr_err("Failed to configure IRQs for device %s in group%lu",
dirent->d_name, group->id);
return ret;
}
pr_info("Assigned device %s in group %lu to device number 0x%x",
dirent->d_name, group->id, device->dev_hdr.dev_num);
list_add(&device->list, &group->devices);
}
if (closedir(dir))
pr_warning("Failed to close IOMMU group %s", dirpath);
}
return 0;
}
/* TODO: this should be an arch callback, so arm can return HYP only if vsmmu */
#define VFIO_TYPE1_NESTING_IOMMU 6
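
/*
 * Pick the IOMMU model for the container: prefer nested translation,
 * then Type 1 v2, then plain Type 1.
 */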
static int vfio_get_iommu_type(void)
{
if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_NESTING_IOMMU))
return VFIO_TYPE1_NESTING_IOMMU;
if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
return VFIO_TYPE1v2_IOMMU;
if (ioctl(vfio_container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
return VFIO_TYPE1_IOMMU;
return -ENODEV;
}
/* VFIO_DEV_DIR "/", a group number and the NUL terminator */
#define VFIO_PATH_MAX_LEN 32
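
/*
 * Open the VFIO container, check the API version, attach each requested
 * IOMMU group, select an IOMMU model and map all of guest RAM for DMA.
 */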
static int vfio_container_init(struct kvm *kvm)
{
	int api, i, ret, iommu_type;
struct vfio_iommu_type1_dma_map dma_map = {
.argsz = sizeof(dma_map),
.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
.vaddr = (unsigned long)kvm->ram_start,
.iova = host_to_guest_flat(kvm, kvm->ram_start),
.size = kvm->ram_size,
};
/* Create a container for our IOMMU groups */
vfio_container = open(VFIO_DEV_NODE, O_RDWR);
if (vfio_container == -1) {
		ret = -errno;
pr_err("Failed to open %s", VFIO_DEV_NODE);
return ret;
}
api = ioctl(vfio_container, VFIO_GET_API_VERSION);
if (api != VFIO_API_VERSION) {
pr_err("Unknown VFIO API version %d", api);
return -ENODEV;
}
iommu_type = vfio_get_iommu_type();
if (iommu_type < 0) {
pr_err("VFIO type-1 IOMMU not supported on this platform");
return iommu_type;
}
/* Sanity check our groups and add them to the container */
for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
char group_node[VFIO_PATH_MAX_LEN];
struct vfio_group *group = &kvm->cfg.vfio_group[i];
struct vfio_group_status group_status = {
.argsz = sizeof(group_status),
};
snprintf(group_node, VFIO_PATH_MAX_LEN, VFIO_DEV_DIR "/%lu",
group->id);
group->fd = open(group_node, O_RDWR);
if (group->fd == -1) {
ret = -errno;
pr_err("Failed to open IOMMU group %s", group_node);
return ret;
}
if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) {
ret = -errno;
pr_err("Failed to determine status of IOMMU group %s",
group_node);
return ret;
}
if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
pr_err("IOMMU group %s is not viable", group_node);
return -EINVAL;
}
if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &vfio_container)) {
ret = -errno;
pr_err("Failed to add IOMMU group %s to VFIO container",
group_node);
return ret;
}
}
/* Finalise the container */
if (ioctl(vfio_container, VFIO_SET_IOMMU, iommu_type)) {
ret = -errno;
pr_err("Failed to set IOMMU type %d for VFIO container",
iommu_type);
return ret;
	}
	pr_info("Using IOMMU type %d for VFIO container", iommu_type);
/* Map the guest memory for DMA (i.e. provide isolation) */
if (ioctl(vfio_container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
ret = -errno;
pr_err("Failed to map guest memory for DMA");
return ret;
}
return 0;
}
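
/*
 * Top-level init hook: nothing to do unless VFIO groups were requested
 * on the command line.
 */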
static int vfio__init(struct kvm *kvm)
{
int ret;
if (!kvm->cfg.num_vfio_groups)
return 0;
ret = vfio_container_init(kvm);
if (ret)
return ret;
ret = vfio_configure_iommu_groups(kvm);
if (ret)
return ret;
return 0;
}
dev_base_init(vfio__init);
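
/*
 * Top-level exit hook: detach and close each group, unmap guest RAM
 * from the IOMMU and close the container.
 */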
static int vfio__exit(struct kvm *kvm)
{
int i, fd;
struct vfio_iommu_type1_dma_unmap dma_unmap = {
.argsz = sizeof(dma_unmap),
.size = kvm->ram_size,
.iova = host_to_guest_flat(kvm, kvm->ram_start),
};
if (!kvm->cfg.num_vfio_groups)
return 0;
for (i = 0; i < kvm->cfg.num_vfio_groups; ++i) {
fd = kvm->cfg.vfio_group[i].fd;
ioctl(fd, VFIO_GROUP_UNSET_CONTAINER, &vfio_container);
close(fd);
}
ioctl(vfio_container, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
return close(vfio_container);
}
dev_base_exit(vfio__exit);