| #include "linux/sizes.h" |
| |
| #include "kvm/irq.h" |
| #include "kvm/kvm.h" |
| #include "kvm/kvm-cpu.h" |
| #include "kvm/vfio.h" |
| |
| #include <assert.h> |
| |
| #include <sys/ioctl.h> |
| #include <sys/eventfd.h> |
| #include <sys/resource.h> |
| #include <sys/time.h> |
| |
| /* Some distros don't have the define. */ |
| #ifndef PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 |
| #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12 |
| #endif |
| |
| /* Wrapper around UAPI vfio_irq_set */ |
| union vfio_irq_eventfd { |
| struct vfio_irq_set irq; |
| u8 buffer[sizeof(struct vfio_irq_set) + sizeof(int)]; |
| }; |
| |
| static void set_vfio_irq_eventd_payload(union vfio_irq_eventfd *evfd, int fd) |
| { |
| memcpy(&evfd->irq.data, &fd, sizeof(fd)); |
| } |
| |
| /* |
| * To support MSI and MSI-X with common code, track the host and guest states of |
| * the MSI/MSI-X capability, and of individual vectors. |
| * |
| * Both MSI and MSI-X capabilities are enabled and disabled through registers. |
| * Vectors cannot be individually disabled. |
| */ |
| #define msi_is_enabled(state) ((state) & VFIO_PCI_MSI_STATE_ENABLED) |
| |
| /* |
| * MSI-X: the control register allows to mask all vectors, and the table allows |
| * to mask each vector individually. |
| * |
| * MSI: if the capability supports Per-Vector Masking then the Mask Bit register |
| * allows to mask each vector individually. Otherwise there is no masking for |
| * MSI. |
| */ |
| #define msi_is_masked(state) ((state) & VFIO_PCI_MSI_STATE_MASKED) |
| |
| /* |
| * A capability is empty when no vector has been registered with SET_IRQS |
| * yet. It's an optimization specific to kvmtool to avoid issuing lots of |
| * SET_IRQS ioctls when the guest configures the MSI-X table while the |
| * capability is masked. |
| */ |
| #define msi_is_empty(state) ((state) & VFIO_PCI_MSI_STATE_EMPTY) |
| |
| #define msi_update_state(state, val, bit) \ |
| (state) = (val) ? (state) | bit : (state) & ~bit; |
| #define msi_set_enabled(state, val) \ |
| msi_update_state(state, val, VFIO_PCI_MSI_STATE_ENABLED) |
| #define msi_set_masked(state, val) \ |
| msi_update_state(state, val, VFIO_PCI_MSI_STATE_MASKED) |
| #define msi_set_empty(state, val) \ |
| msi_update_state(state, val, VFIO_PCI_MSI_STATE_EMPTY) |
| |
| static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev); |
| static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev); |
| |
| static int vfio_pci_enable_msis(struct kvm *kvm, struct vfio_device *vdev, |
| bool msix) |
| { |
| size_t i; |
| int ret = 0; |
| int *eventfds; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi; |
| union vfio_irq_eventfd single = { |
| .irq = { |
| .argsz = sizeof(single), |
| .flags = VFIO_IRQ_SET_DATA_EVENTFD | |
| VFIO_IRQ_SET_ACTION_TRIGGER, |
| .index = msis->info.index, |
| .count = 1, |
| }, |
| }; |
| |
| if (!msi_is_enabled(msis->guest_state)) |
| return 0; |
| |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) |
| /* |
| * PCI (and VFIO) forbids enabling INTx, MSI or MSIX at the same |
| * time. Since INTx has to be enabled from the start (we don't |
| * have a reliable way to know when the guest starts using it), |
| * disable it now. |
| */ |
| vfio_pci_disable_intx(kvm, vdev); |
| |
| eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set); |
| |
| /* |
| * Initial registration of the full range. This enables the physical |
| * MSI/MSI-X capability, which might have side effects. For instance |
| * when assigning virtio legacy devices, enabling the MSI capability |
| * modifies the config space layout! |
| * |
| * As an optimization, only update MSIs when guest unmasks the |
| * capability. This greatly reduces the initialization time for Linux |
| * guest with 2048+ MSIs. Linux guest starts by enabling the MSI-X cap |
| * masked, then fills individual vectors, then unmasks the whole |
| * function. So we only do one VFIO ioctl when enabling for the first |
| * time, and then one when unmasking. |
| */ |
| if (!msi_is_enabled(msis->host_state) || |
| (!msi_is_masked(msis->guest_state) && |
| msi_is_empty(msis->host_state))) { |
| bool empty = true; |
| |
| for (i = 0; i < msis->nr_entries; i++) { |
| eventfds[i] = msis->entries[i].gsi >= 0 ? |
| msis->entries[i].eventfd : -1; |
| |
| if (eventfds[i] >= 0) |
| empty = false; |
| } |
| |
| ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, msis->irq_set); |
| if (ret < 0) { |
| perror("VFIO_DEVICE_SET_IRQS(multi)"); |
| return ret; |
| } |
| |
| msi_set_enabled(msis->host_state, true); |
| msi_set_empty(msis->host_state, empty); |
| |
| return 0; |
| } |
| |
| if (msi_is_masked(msis->guest_state)) { |
| /* TODO: if host_state is not empty nor masked, mask all vectors */ |
| return 0; |
| } |
| |
| /* Update individual vectors to avoid breaking those in use */ |
| for (i = 0; i < msis->nr_entries; i++) { |
| struct vfio_pci_msi_entry *entry = &msis->entries[i]; |
| int fd = entry->gsi >= 0 ? entry->eventfd : -1; |
| |
| if (fd == eventfds[i]) |
| continue; |
| |
| single.irq.start = i; |
| set_vfio_irq_eventd_payload(&single, fd); |
| |
| ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &single); |
| if (ret < 0) { |
| perror("VFIO_DEVICE_SET_IRQS(single)"); |
| break; |
| } |
| |
| eventfds[i] = fd; |
| |
| if (msi_is_empty(msis->host_state) && fd >= 0) |
| msi_set_empty(msis->host_state, false); |
| } |
| |
| return ret; |
| } |
| |
| static int vfio_pci_disable_msis(struct kvm *kvm, struct vfio_device *vdev, |
| bool msix) |
| { |
| int ret; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct vfio_pci_msi_common *msis = msix ? &pdev->msix : &pdev->msi; |
| struct vfio_irq_set irq_set = { |
| .argsz = sizeof(irq_set), |
| .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, |
| .index = msis->info.index, |
| .start = 0, |
| .count = 0, |
| }; |
| |
| if (!msi_is_enabled(msis->host_state)) |
| return 0; |
| |
| ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); |
| if (ret < 0) { |
| perror("VFIO_DEVICE_SET_IRQS(NONE)"); |
| return ret; |
| } |
| |
| msi_set_enabled(msis->host_state, false); |
| msi_set_empty(msis->host_state, true); |
| |
| /* |
| * When MSI or MSIX is disabled, this might be called when |
| * PCI driver detects the MSI interrupt failure and wants to |
| * rollback to INTx mode. Thus enable INTx if the device |
| * supports INTx mode in this case. |
| */ |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) |
| ret = vfio_pci_enable_intx(kvm, vdev); |
| |
| return ret >= 0 ? 0 : ret; |
| } |
| |
| static int vfio_pci_update_msi_entry(struct kvm *kvm, struct vfio_device *vdev, |
| struct vfio_pci_msi_entry *entry) |
| { |
| int ret; |
| |
| if (entry->eventfd < 0) { |
| entry->eventfd = eventfd(0, 0); |
| if (entry->eventfd < 0) { |
| ret = -errno; |
| vfio_dev_err(vdev, "cannot create eventfd"); |
| return ret; |
| } |
| } |
| |
| /* Allocate IRQ if necessary */ |
| if (entry->gsi < 0) { |
| int ret = irq__add_msix_route(kvm, &entry->config.msg, |
| vdev->dev_hdr.dev_num << 3); |
| if (ret < 0) { |
| vfio_dev_err(vdev, "cannot create MSI-X route"); |
| return ret; |
| } |
| entry->gsi = ret; |
| } else { |
| irq__update_msix_route(kvm, entry->gsi, &entry->config.msg); |
| } |
| |
| /* |
| * MSI masking is unimplemented in VFIO, so we have to handle it by |
| * disabling/enabling IRQ route instead. We do it on the KVM side rather |
| * than VFIO, because: |
| * - it is 8x faster |
| * - it allows to decouple masking logic from capability state. |
| * - in masked state, after removing irqfd route, we could easily plug |
| * the eventfd in a local handler, in order to serve Pending Bit reads |
| * to the guest. |
| * |
| * So entry->host_state is masked when there is no active irqfd route. |
| */ |
| if (msi_is_masked(entry->guest_state) == msi_is_masked(entry->host_state)) |
| return 0; |
| |
| if (msi_is_masked(entry->host_state)) { |
| ret = irq__add_irqfd(kvm, entry->gsi, entry->eventfd, -1); |
| if (ret < 0) { |
| vfio_dev_err(vdev, "cannot setup irqfd"); |
| return ret; |
| } |
| } else { |
| irq__del_irqfd(kvm, entry->gsi, entry->eventfd); |
| } |
| |
| msi_set_masked(entry->host_state, msi_is_masked(entry->guest_state)); |
| |
| return 0; |
| } |
| |
| static void vfio_pci_msix_pba_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, |
| u32 len, u8 is_write, void *ptr) |
| { |
| struct vfio_pci_device *pdev = ptr; |
| struct vfio_pci_msix_pba *pba = &pdev->msix_pba; |
| u64 offset = addr - pba->guest_phys_addr; |
| struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci); |
| |
| if (offset >= pba->size) { |
| vfio_dev_err(vdev, "access outside of the MSIX PBA"); |
| return; |
| } |
| |
| if (is_write) |
| return; |
| |
| /* |
| * TODO: emulate PBA. Hardware MSI-X is never masked, so reading the PBA |
| * is completely useless here. Note that Linux doesn't use PBA. |
| */ |
| if (pread(vdev->fd, data, len, pba->fd_offset + offset) != (ssize_t)len) |
| vfio_dev_err(vdev, "cannot access MSIX PBA\n"); |
| } |
| |
| static void vfio_pci_msix_table_access(struct kvm_cpu *vcpu, u64 addr, u8 *data, |
| u32 len, u8 is_write, void *ptr) |
| { |
| struct kvm *kvm = vcpu->kvm; |
| struct vfio_pci_msi_entry *entry; |
| struct vfio_pci_device *pdev = ptr; |
| struct vfio_device *vdev = container_of(pdev, struct vfio_device, pci); |
| |
| u64 offset = addr - pdev->msix_table.guest_phys_addr; |
| if (offset >= pdev->msix_table.size) { |
| vfio_dev_err(vdev, "access outside of the MSI-X table"); |
| return; |
| } |
| |
| size_t vector = offset / PCI_MSIX_ENTRY_SIZE; |
| off_t field = offset % PCI_MSIX_ENTRY_SIZE; |
| |
| /* |
| * PCI spec says that software must use aligned 4 or 8 bytes accesses |
| * for the MSI-X tables. |
| */ |
| if ((len != 4 && len != 8) || addr & (len - 1)) { |
| vfio_dev_warn(vdev, "invalid MSI-X table access"); |
| return; |
| } |
| |
| entry = &pdev->msix.entries[vector]; |
| |
| mutex_lock(&pdev->msix.mutex); |
| |
| if (!is_write) { |
| memcpy(data, (void *)&entry->config + field, len); |
| goto out_unlock; |
| } |
| |
| memcpy((void *)&entry->config + field, data, len); |
| |
| /* |
| * Check if access touched the vector control register, which is at the |
| * end of the MSI-X entry. |
| */ |
| if (field + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) |
| goto out_unlock; |
| |
| msi_set_masked(entry->guest_state, entry->config.ctrl & |
| PCI_MSIX_ENTRY_CTRL_MASKBIT); |
| |
| if (vfio_pci_update_msi_entry(kvm, vdev, entry) < 0) |
| /* Not much we can do here. */ |
| vfio_dev_err(vdev, "failed to configure MSIX vector %zu", vector); |
| |
| /* Update the physical capability if necessary */ |
| if (vfio_pci_enable_msis(kvm, vdev, true)) |
| vfio_dev_err(vdev, "cannot enable MSIX"); |
| |
| out_unlock: |
| mutex_unlock(&pdev->msix.mutex); |
| } |
| |
| static void vfio_pci_msix_cap_write(struct kvm *kvm, |
| struct vfio_device *vdev, u16 off, |
| void *data, int sz) |
| { |
| struct vfio_pci_device *pdev = &vdev->pci; |
| off_t enable_pos = PCI_MSIX_FLAGS + 1; |
| bool enable; |
| u16 flags; |
| |
| off -= pdev->msix.pos; |
| |
| /* Check if access intersects with the MSI-X Enable bit */ |
| if (off > enable_pos || off + sz <= enable_pos) |
| return; |
| |
| /* Read byte that contains the Enable bit */ |
| flags = *(u8 *)(data + enable_pos - off) << 8; |
| |
| mutex_lock(&pdev->msix.mutex); |
| |
| msi_set_masked(pdev->msix.guest_state, flags & PCI_MSIX_FLAGS_MASKALL); |
| enable = flags & PCI_MSIX_FLAGS_ENABLE; |
| msi_set_enabled(pdev->msix.guest_state, enable); |
| |
| if (enable && vfio_pci_enable_msis(kvm, vdev, true)) |
| vfio_dev_err(vdev, "cannot enable MSIX"); |
| else if (!enable && vfio_pci_disable_msis(kvm, vdev, true)) |
| vfio_dev_err(vdev, "cannot disable MSIX"); |
| |
| mutex_unlock(&pdev->msix.mutex); |
| } |
| |
| static int vfio_pci_msi_vector_write(struct kvm *kvm, struct vfio_device *vdev, |
| u16 off, u8 *data, u32 sz) |
| { |
| size_t i; |
| u32 mask = 0; |
| size_t mask_pos, start, limit; |
| struct vfio_pci_msi_entry *entry; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos); |
| |
| if (!(msi_cap_64->ctrl & PCI_MSI_FLAGS_MASKBIT)) |
| return 0; |
| |
| if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) |
| mask_pos = PCI_MSI_MASK_64; |
| else |
| mask_pos = PCI_MSI_MASK_32; |
| |
| if (off >= mask_pos + 4 || off + sz <= mask_pos) |
| return 0; |
| |
| /* Set mask to current state */ |
| for (i = 0; i < pdev->msi.nr_entries; i++) { |
| entry = &pdev->msi.entries[i]; |
| mask |= !!msi_is_masked(entry->guest_state) << i; |
| } |
| |
| /* Update mask following the intersection of access and register */ |
| start = max_t(size_t, off, mask_pos); |
| limit = min_t(size_t, off + sz, mask_pos + 4); |
| |
| memcpy((void *)&mask + start - mask_pos, data + start - off, |
| limit - start); |
| |
| /* Update states if necessary */ |
| for (i = 0; i < pdev->msi.nr_entries; i++) { |
| bool masked = mask & (1 << i); |
| |
| entry = &pdev->msi.entries[i]; |
| if (masked != msi_is_masked(entry->guest_state)) { |
| msi_set_masked(entry->guest_state, masked); |
| vfio_pci_update_msi_entry(kvm, vdev, entry); |
| } |
| } |
| |
| return 1; |
| } |
| |
| static void vfio_pci_msi_cap_write(struct kvm *kvm, struct vfio_device *vdev, |
| u16 off, u8 *data, u32 sz) |
| { |
| u8 ctrl; |
| struct msi_msg msg; |
| size_t i, nr_vectors; |
| struct vfio_pci_msi_entry *entry; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct msi_cap_64 *msi_cap_64 = PCI_CAP(&pdev->hdr, pdev->msi.pos); |
| |
| off -= pdev->msi.pos; |
| |
| mutex_lock(&pdev->msi.mutex); |
| |
| /* Check if the guest is trying to update mask bits */ |
| if (vfio_pci_msi_vector_write(kvm, vdev, off, data, sz)) |
| goto out_unlock; |
| |
| /* Only modify routes when guest pokes the enable bit */ |
| if (off > PCI_MSI_FLAGS || off + sz <= PCI_MSI_FLAGS) |
| goto out_unlock; |
| |
| ctrl = *(u8 *)(data + PCI_MSI_FLAGS - off); |
| |
| msi_set_enabled(pdev->msi.guest_state, ctrl & PCI_MSI_FLAGS_ENABLE); |
| |
| if (!msi_is_enabled(pdev->msi.guest_state)) { |
| vfio_pci_disable_msis(kvm, vdev, false); |
| goto out_unlock; |
| } |
| |
| /* Create routes for the requested vectors */ |
| nr_vectors = 1 << ((ctrl & PCI_MSI_FLAGS_QSIZE) >> 4); |
| |
| msg.address_lo = msi_cap_64->address_lo; |
| if (msi_cap_64->ctrl & PCI_MSI_FLAGS_64BIT) { |
| msg.address_hi = msi_cap_64->address_hi; |
| msg.data = msi_cap_64->data; |
| } else { |
| struct msi_cap_32 *msi_cap_32 = (void *)msi_cap_64; |
| msg.address_hi = 0; |
| msg.data = msi_cap_32->data; |
| } |
| |
| for (i = 0; i < nr_vectors; i++) { |
| entry = &pdev->msi.entries[i]; |
| |
| /* |
| * Set the MSI data value as required by the PCI local |
| * bus specifications, MSI capability, "Message Data". |
| */ |
| msg.data &= ~(nr_vectors - 1); |
| msg.data |= i; |
| |
| entry->config.msg = msg; |
| vfio_pci_update_msi_entry(kvm, vdev, entry); |
| } |
| |
| /* Update the physical capability if necessary */ |
| if (vfio_pci_enable_msis(kvm, vdev, false)) |
| vfio_dev_err(vdev, "cannot enable MSI"); |
| |
| out_unlock: |
| mutex_unlock(&pdev->msi.mutex); |
| } |
| |
| static int vfio_pci_bar_activate(struct kvm *kvm, |
| struct pci_device_header *pci_hdr, |
| int bar_num, void *data) |
| { |
| struct vfio_device *vdev = data; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct vfio_pci_msix_pba *pba = &pdev->msix_pba; |
| struct vfio_pci_msix_table *table = &pdev->msix_table; |
| struct vfio_region *region; |
| u32 bar_addr; |
| bool has_msix; |
| int ret; |
| |
| assert((u32)bar_num < vdev->info.num_regions); |
| |
| region = &vdev->regions[bar_num]; |
| has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX; |
| |
| bar_addr = pci__bar_address(pci_hdr, bar_num); |
| if (pci__bar_is_io(pci_hdr, bar_num)) |
| region->port_base = bar_addr; |
| else |
| region->guest_phys_addr = bar_addr; |
| |
| if (has_msix && (u32)bar_num == table->bar) { |
| table->guest_phys_addr = region->guest_phys_addr; |
| ret = kvm__register_mmio(kvm, table->guest_phys_addr, |
| table->size, false, |
| vfio_pci_msix_table_access, pdev); |
| /* |
| * The MSIX table and the PBA structure can share the same BAR, |
| * but for convenience we register different regions for mmio |
| * emulation. We want to we update both if they share the same |
| * BAR. |
| */ |
| if (ret < 0 || table->bar != pba->bar) |
| goto out; |
| } |
| |
| if (has_msix && (u32)bar_num == pba->bar) { |
| if (pba->bar == table->bar) |
| pba->guest_phys_addr = table->guest_phys_addr + pba->bar_offset; |
| else |
| pba->guest_phys_addr = region->guest_phys_addr; |
| ret = kvm__register_mmio(kvm, pba->guest_phys_addr, |
| pba->size, false, |
| vfio_pci_msix_pba_access, pdev); |
| goto out; |
| } |
| |
| ret = vfio_map_region(kvm, vdev, region); |
| out: |
| return ret; |
| } |
| |
| static int vfio_pci_bar_deactivate(struct kvm *kvm, |
| struct pci_device_header *pci_hdr, |
| int bar_num, void *data) |
| { |
| struct vfio_device *vdev = data; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct vfio_pci_msix_pba *pba = &pdev->msix_pba; |
| struct vfio_pci_msix_table *table = &pdev->msix_table; |
| struct vfio_region *region; |
| bool has_msix, success; |
| int ret; |
| |
| assert((u32)bar_num < vdev->info.num_regions); |
| |
| region = &vdev->regions[bar_num]; |
| has_msix = pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX; |
| |
| if (has_msix && (u32)bar_num == table->bar) { |
| success = kvm__deregister_mmio(kvm, table->guest_phys_addr); |
| /* kvm__deregister_mmio fails when the region is not found. */ |
| ret = (success ? 0 : -ENOENT); |
| /* See vfio_pci_bar_activate(). */ |
| if (ret < 0 || table->bar!= pba->bar) |
| goto out; |
| } |
| |
| if (has_msix && (u32)bar_num == pba->bar) { |
| success = kvm__deregister_mmio(kvm, pba->guest_phys_addr); |
| ret = (success ? 0 : -ENOENT); |
| goto out; |
| } |
| |
| vfio_unmap_region(kvm, region); |
| ret = 0; |
| |
| out: |
| return ret; |
| } |
| |
| static void vfio_pci_cfg_read(struct kvm *kvm, struct pci_device_header *pci_hdr, |
| u16 offset, void *data, int sz) |
| { |
| struct vfio_region_info *info; |
| struct vfio_pci_device *pdev; |
| struct vfio_device *vdev; |
| char base[sz]; |
| |
| pdev = container_of(pci_hdr, struct vfio_pci_device, hdr); |
| vdev = container_of(pdev, struct vfio_device, pci); |
| info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; |
| |
| /* Dummy read in case of side-effects */ |
| if (pread(vdev->fd, base, sz, info->offset + offset) != sz) |
| vfio_dev_warn(vdev, "failed to read %d bytes from Configuration Space at 0x%x", |
| sz, offset); |
| } |
| |
| static void vfio_pci_cfg_write(struct kvm *kvm, struct pci_device_header *pci_hdr, |
| u16 offset, void *data, int sz) |
| { |
| struct vfio_region_info *info; |
| struct vfio_pci_device *pdev; |
| struct vfio_device *vdev; |
| u32 tmp; |
| |
| /* Make sure a larger size will not overrun tmp on the stack. */ |
| assert(sz <= 4); |
| |
| if (offset == PCI_ROM_ADDRESS) |
| return; |
| |
| pdev = container_of(pci_hdr, struct vfio_pci_device, hdr); |
| vdev = container_of(pdev, struct vfio_device, pci); |
| info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; |
| |
| if (pwrite(vdev->fd, data, sz, info->offset + offset) != sz) |
| vfio_dev_warn(vdev, "Failed to write %d bytes to Configuration Space at 0x%x", |
| sz, offset); |
| |
| /* Handle MSI write now, since it might update the hardware capability */ |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) |
| vfio_pci_msix_cap_write(kvm, vdev, offset, data, sz); |
| |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) |
| vfio_pci_msi_cap_write(kvm, vdev, offset, data, sz); |
| |
| if (pread(vdev->fd, &tmp, sz, info->offset + offset) != sz) |
| vfio_dev_warn(vdev, "Failed to read %d bytes from Configuration Space at 0x%x", |
| sz, offset); |
| } |
| |
| static ssize_t vfio_pci_msi_cap_size(struct msi_cap_64 *cap_hdr) |
| { |
| size_t size = 10; |
| |
| if (cap_hdr->ctrl & PCI_MSI_FLAGS_64BIT) |
| size += 4; |
| if (cap_hdr->ctrl & PCI_MSI_FLAGS_MASKBIT) |
| size += 10; |
| |
| return size; |
| } |
| |
| static ssize_t vfio_pci_cap_size(struct pci_cap_hdr *cap_hdr) |
| { |
| switch (cap_hdr->type) { |
| case PCI_CAP_ID_MSIX: |
| return PCI_CAP_MSIX_SIZEOF; |
| case PCI_CAP_ID_MSI: |
| return vfio_pci_msi_cap_size((void *)cap_hdr); |
| case PCI_CAP_ID_EXP: |
| /* |
| * We don't emulate any of the link, slot and root complex |
| * properties, so ignore them. |
| */ |
| return PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1; |
| default: |
| pr_err("unknown PCI capability 0x%x", cap_hdr->type); |
| return 0; |
| } |
| } |
| |
| static int vfio_pci_add_cap(struct vfio_device *vdev, u8 *virt_hdr, |
| struct pci_cap_hdr *cap, off_t pos) |
| { |
| struct pci_cap_hdr *last; |
| struct pci_device_header *hdr = &vdev->pci.hdr; |
| |
| cap->next = 0; |
| |
| if (!hdr->capabilities) { |
| hdr->capabilities = pos; |
| hdr->status |= PCI_STATUS_CAP_LIST; |
| } else { |
| last = PCI_CAP(virt_hdr, hdr->capabilities); |
| |
| while (last->next) |
| last = PCI_CAP(virt_hdr, last->next); |
| |
| last->next = pos; |
| } |
| |
| memcpy(virt_hdr + pos, cap, vfio_pci_cap_size(cap)); |
| |
| return 0; |
| } |
| |
| static int vfio_pci_parse_caps(struct vfio_device *vdev) |
| { |
| int ret; |
| size_t size; |
| u16 pos, next; |
| struct pci_cap_hdr *cap; |
| u8 virt_hdr[PCI_DEV_CFG_SIZE_LEGACY]; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| |
| if (!(pdev->hdr.status & PCI_STATUS_CAP_LIST)) |
| return 0; |
| |
| memset(virt_hdr, 0, PCI_DEV_CFG_SIZE_LEGACY); |
| |
| pos = pdev->hdr.capabilities & ~3; |
| |
| pdev->hdr.status &= ~PCI_STATUS_CAP_LIST; |
| pdev->hdr.capabilities = 0; |
| |
| for (; pos; pos = next) { |
| cap = PCI_CAP(&pdev->hdr, pos); |
| next = cap->next; |
| |
| switch (cap->type) { |
| case PCI_CAP_ID_MSIX: |
| ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos); |
| if (ret) |
| return ret; |
| |
| pdev->msix.pos = pos; |
| pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSIX; |
| break; |
| case PCI_CAP_ID_MSI: |
| ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos); |
| if (ret) |
| return ret; |
| |
| pdev->msi.pos = pos; |
| pdev->irq_modes |= VFIO_PCI_IRQ_MODE_MSI; |
| break; |
| case PCI_CAP_ID_EXP: |
| if (!arch_has_pci_exp()) |
| continue; |
| ret = vfio_pci_add_cap(vdev, virt_hdr, cap, pos); |
| if (ret) |
| return ret; |
| break; |
| } |
| } |
| |
| /* Wipe remaining capabilities */ |
| pos = PCI_STD_HEADER_SIZEOF; |
| size = PCI_DEV_CFG_SIZE_LEGACY - PCI_STD_HEADER_SIZEOF; |
| memcpy((void *)&pdev->hdr + pos, virt_hdr + pos, size); |
| |
| return 0; |
| } |
| |
| static int vfio_pci_parse_cfg_space(struct vfio_device *vdev) |
| { |
| ssize_t sz = PCI_DEV_CFG_SIZE_LEGACY; |
| struct vfio_region_info *info; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| |
| if (vdev->info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { |
| vfio_dev_err(vdev, "Config Space not found"); |
| return -ENODEV; |
| } |
| |
| info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; |
| *info = (struct vfio_region_info) { |
| .argsz = sizeof(*info), |
| .index = VFIO_PCI_CONFIG_REGION_INDEX, |
| }; |
| |
| ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info); |
| if (!info->size) { |
| vfio_dev_err(vdev, "Config Space has size zero?!"); |
| return -EINVAL; |
| } |
| |
| /* Read standard headers and capabilities */ |
| if (pread(vdev->fd, &pdev->hdr, sz, info->offset) != sz) { |
| vfio_dev_err(vdev, "failed to read %zd bytes of Config Space", sz); |
| return -EIO; |
| } |
| |
| /* Strip bit 7, that indicates multifunction */ |
| pdev->hdr.header_type &= 0x7f; |
| |
| if (pdev->hdr.header_type != PCI_HEADER_TYPE_NORMAL) { |
| vfio_dev_err(vdev, "unsupported header type %u", |
| pdev->hdr.header_type); |
| return -EOPNOTSUPP; |
| } |
| |
| if (pdev->hdr.irq_pin) |
| pdev->irq_modes |= VFIO_PCI_IRQ_MODE_INTX; |
| |
| vfio_pci_parse_caps(vdev); |
| |
| return 0; |
| } |
| |
| static int vfio_pci_fixup_cfg_space(struct vfio_device *vdev) |
| { |
| int i; |
| u64 base; |
| ssize_t hdr_sz; |
| struct msix_cap *msix; |
| struct vfio_region_info *info; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct vfio_region *region; |
| |
| /* Initialise the BARs */ |
| for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) { |
| if ((u32)i == vdev->info.num_regions) |
| break; |
| |
| region = &vdev->regions[i]; |
| /* Construct a fake reg to match what we've mapped. */ |
| if (region->is_ioport) { |
| base = (region->port_base & PCI_BASE_ADDRESS_IO_MASK) | |
| PCI_BASE_ADDRESS_SPACE_IO; |
| } else { |
| base = (region->guest_phys_addr & |
| PCI_BASE_ADDRESS_MEM_MASK) | |
| PCI_BASE_ADDRESS_SPACE_MEMORY; |
| } |
| |
| pdev->hdr.bar[i] = base; |
| |
| if (!base) |
| continue; |
| |
| pdev->hdr.bar_size[i] = region->info.size; |
| } |
| |
| /* I really can't be bothered to support cardbus. */ |
| pdev->hdr.card_bus = 0; |
| |
| /* |
| * Nuke the expansion ROM for now. If we want to do this properly, |
| * we need to save its size somewhere and map into the guest. |
| */ |
| pdev->hdr.exp_rom_bar = 0; |
| |
| /* Plumb in our fake MSI-X capability, if we have it. */ |
| msix = pci_find_cap(&pdev->hdr, PCI_CAP_ID_MSIX); |
| if (msix) { |
| /* Add a shortcut to the PBA region for the MMIO handler */ |
| int pba_index = VFIO_PCI_BAR0_REGION_INDEX + pdev->msix_pba.bar; |
| u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET; |
| |
| pdev->msix_pba.fd_offset = vdev->regions[pba_index].info.offset + |
| pba_bar_offset; |
| |
| /* Tidy up the capability */ |
| msix->table_offset &= PCI_MSIX_TABLE_BIR; |
| if (pdev->msix_table.bar == pdev->msix_pba.bar) { |
| /* Keep the same offset as the MSIX cap. */ |
| pdev->msix_pba.bar_offset = pba_bar_offset; |
| } else { |
| /* PBA is at the start of the BAR. */ |
| msix->pba_offset &= PCI_MSIX_PBA_BIR; |
| pdev->msix_pba.bar_offset = 0; |
| } |
| } |
| |
| /* Install our fake Configuration Space */ |
| info = &vdev->regions[VFIO_PCI_CONFIG_REGION_INDEX].info; |
| /* |
| * We don't touch the extended configuration space, let's be cautious |
| * and not overwrite it all with zeros, or bad things might happen. |
| */ |
| hdr_sz = PCI_DEV_CFG_SIZE_LEGACY; |
| if (pwrite(vdev->fd, &pdev->hdr, hdr_sz, info->offset) != hdr_sz) { |
| vfio_dev_err(vdev, "failed to write %zd bytes to Config Space", |
| hdr_sz); |
| return -EIO; |
| } |
| |
| /* Register callbacks for cfg accesses */ |
| pdev->hdr.cfg_ops = (struct pci_config_operations) { |
| .read = vfio_pci_cfg_read, |
| .write = vfio_pci_cfg_write, |
| }; |
| |
| pdev->hdr.irq_type = IRQ_TYPE_LEVEL_HIGH; |
| |
| return 0; |
| } |
| |
| static int vfio_pci_get_region_info(struct vfio_device *vdev, u32 index, |
| struct vfio_region_info *info) |
| { |
| int ret; |
| |
| *info = (struct vfio_region_info) { |
| .argsz = sizeof(*info), |
| .index = index, |
| }; |
| |
| ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, info); |
| if (ret) { |
| ret = -errno; |
| vfio_dev_err(vdev, "cannot get info for BAR %u", index); |
| return ret; |
| } |
| |
| if (info->size && !is_power_of_two(info->size)) { |
| vfio_dev_err(vdev, "region is not power of two: 0x%llx", |
| info->size); |
| return -EINVAL; |
| } |
| |
| return 0; |
| } |
| |
| static int vfio_pci_create_msix_table(struct kvm *kvm, struct vfio_device *vdev) |
| { |
| int ret; |
| size_t i; |
| size_t map_size; |
| size_t nr_entries; |
| struct vfio_pci_msi_entry *entries; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct vfio_pci_msix_pba *pba = &pdev->msix_pba; |
| struct vfio_pci_msix_table *table = &pdev->msix_table; |
| struct msix_cap *msix = PCI_CAP(&pdev->hdr, pdev->msix.pos); |
| struct vfio_region_info info; |
| |
| table->bar = msix->table_offset & PCI_MSIX_TABLE_BIR; |
| pba->bar = msix->pba_offset & PCI_MSIX_TABLE_BIR; |
| |
| nr_entries = (msix->ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; |
| |
| /* MSIX table and PBA must support QWORD accesses. */ |
| table->size = ALIGN(nr_entries * PCI_MSIX_ENTRY_SIZE, 8); |
| pba->size = ALIGN(DIV_ROUND_UP(nr_entries, 64), 8); |
| |
| entries = calloc(nr_entries, sizeof(struct vfio_pci_msi_entry)); |
| if (!entries) |
| return -ENOMEM; |
| |
| for (i = 0; i < nr_entries; i++) |
| entries[i].config.ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; |
| |
| ret = vfio_pci_get_region_info(vdev, table->bar, &info); |
| if (ret) |
| return ret; |
| if (!info.size) |
| return -EINVAL; |
| |
| map_size = ALIGN(info.size, MAX_PAGE_SIZE); |
| table->guest_phys_addr = pci_get_mmio_block(map_size); |
| if (!table->guest_phys_addr) { |
| pr_err("cannot allocate MMIO space"); |
| ret = -ENOMEM; |
| goto out_free; |
| } |
| |
| /* |
| * We could map the physical PBA directly into the guest, but it's |
| * likely smaller than a page, and we can only hand full pages to the |
| * guest. Even though the PCI spec disallows sharing a page used for |
| * MSI-X with any other resource, it allows to share the same page |
| * between MSI-X table and PBA. For the sake of isolation, create a |
| * virtual PBA. |
| */ |
| if (table->bar == pba->bar) { |
| u32 pba_bar_offset = msix->pba_offset & PCI_MSIX_PBA_OFFSET; |
| |
| /* Sanity checks. */ |
| if (table->size > pba_bar_offset) |
| die("MSIX table overlaps with PBA"); |
| if (pba_bar_offset + pba->size > info.size) |
| die("PBA exceeds the size of the region"); |
| pba->guest_phys_addr = table->guest_phys_addr + pba_bar_offset; |
| } else { |
| ret = vfio_pci_get_region_info(vdev, pba->bar, &info); |
| if (ret) |
| return ret; |
| if (!info.size) |
| return -EINVAL; |
| |
| map_size = ALIGN(info.size, MAX_PAGE_SIZE); |
| pba->guest_phys_addr = pci_get_mmio_block(map_size); |
| if (!pba->guest_phys_addr) { |
| pr_err("cannot allocate MMIO space"); |
| ret = -ENOMEM; |
| goto out_free; |
| } |
| } |
| |
| pdev->msix.entries = entries; |
| pdev->msix.nr_entries = nr_entries; |
| |
| return 0; |
| |
| out_free: |
| free(entries); |
| |
| return ret; |
| } |
| |
| static int vfio_pci_create_msi_cap(struct kvm *kvm, struct vfio_pci_device *pdev) |
| { |
| struct msi_cap_64 *cap = PCI_CAP(&pdev->hdr, pdev->msi.pos); |
| |
| pdev->msi.nr_entries = 1 << ((cap->ctrl & PCI_MSI_FLAGS_QMASK) >> 1), |
| pdev->msi.entries = calloc(pdev->msi.nr_entries, |
| sizeof(struct vfio_pci_msi_entry)); |
| if (!pdev->msi.entries) |
| return -ENOMEM; |
| |
| return 0; |
| } |
| |
| static int vfio_pci_configure_bar(struct kvm *kvm, struct vfio_device *vdev, |
| size_t nr) |
| { |
| int ret; |
| u32 bar; |
| size_t map_size; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct vfio_region *region; |
| |
| if (nr >= vdev->info.num_regions) |
| return 0; |
| |
| region = &vdev->regions[nr]; |
| bar = pdev->hdr.bar[nr]; |
| |
| region->vdev = vdev; |
| region->is_ioport = !!(bar & PCI_BASE_ADDRESS_SPACE_IO); |
| |
| ret = vfio_pci_get_region_info(vdev, nr, ®ion->info); |
| if (ret) |
| return ret; |
| |
| /* Ignore invalid or unimplemented regions */ |
| if (!region->info.size) |
| return 0; |
| |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) { |
| /* Trap and emulate MSI-X table */ |
| if (nr == pdev->msix_table.bar) { |
| region->guest_phys_addr = pdev->msix_table.guest_phys_addr; |
| return 0; |
| } else if (nr == pdev->msix_pba.bar) { |
| region->guest_phys_addr = pdev->msix_pba.guest_phys_addr; |
| return 0; |
| } |
| } |
| |
| if (region->is_ioport) { |
| region->port_base = pci_get_io_port_block(region->info.size); |
| } else { |
| /* Grab some MMIO space in the guest */ |
| map_size = ALIGN(region->info.size, PAGE_SIZE); |
| region->guest_phys_addr = pci_get_mmio_block(map_size); |
| } |
| |
| return 0; |
| } |
| |
| static int vfio_pci_configure_dev_regions(struct kvm *kvm, |
| struct vfio_device *vdev) |
| { |
| int ret; |
| u32 bar; |
| size_t i; |
| bool is_64bit = false; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| |
| ret = vfio_pci_parse_cfg_space(vdev); |
| if (ret) |
| return ret; |
| |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) { |
| ret = vfio_pci_create_msix_table(kvm, vdev); |
| if (ret) |
| return ret; |
| } |
| |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) { |
| ret = vfio_pci_create_msi_cap(kvm, pdev); |
| if (ret) |
| return ret; |
| } |
| |
| for (i = VFIO_PCI_BAR0_REGION_INDEX; i <= VFIO_PCI_BAR5_REGION_INDEX; ++i) { |
| /* Ignore top half of 64-bit BAR */ |
| if (is_64bit) { |
| is_64bit = false; |
| continue; |
| } |
| |
| ret = vfio_pci_configure_bar(kvm, vdev, i); |
| if (ret) |
| return ret; |
| |
| bar = pdev->hdr.bar[i]; |
| is_64bit = (bar & PCI_BASE_ADDRESS_SPACE) == |
| PCI_BASE_ADDRESS_SPACE_MEMORY && |
| bar & PCI_BASE_ADDRESS_MEM_TYPE_64; |
| } |
| |
| /* We've configured the BARs, fake up a Configuration Space */ |
| ret = vfio_pci_fixup_cfg_space(vdev); |
| if (ret) |
| return ret; |
| |
| return pci__register_bar_regions(kvm, &pdev->hdr, vfio_pci_bar_activate, |
| vfio_pci_bar_deactivate, vdev); |
| } |
| |
| /* |
| * Attempt to update the FD limit, if opening an eventfd for each IRQ vector |
| * would hit the limit. Which is likely to happen when a device uses 2048 MSIs. |
| */ |
| static int vfio_pci_reserve_irq_fds(size_t num) |
| { |
| /* |
| * I counted around 27 fds under normal load. Let's add 100 for good |
| * measure. |
| */ |
| static size_t needed = 128; |
| struct rlimit fd_limit, new_limit; |
| |
| needed += num; |
| |
| if (getrlimit(RLIMIT_NOFILE, &fd_limit)) { |
| perror("getrlimit(RLIMIT_NOFILE)"); |
| return 0; |
| } |
| |
| if (fd_limit.rlim_cur >= needed) |
| return 0; |
| |
| new_limit.rlim_cur = needed; |
| |
| if (fd_limit.rlim_max < needed) |
| /* Try to bump hard limit (root only) */ |
| new_limit.rlim_max = needed; |
| else |
| new_limit.rlim_max = fd_limit.rlim_max; |
| |
| if (setrlimit(RLIMIT_NOFILE, &new_limit)) { |
| perror("setrlimit(RLIMIT_NOFILE)"); |
| pr_warning("not enough FDs for full MSI-X support (estimated need: %zu)", |
| (size_t)(needed - fd_limit.rlim_cur)); |
| } |
| |
| return 0; |
| } |
| |
| static int vfio_pci_init_msis(struct kvm *kvm, struct vfio_device *vdev, |
| struct vfio_pci_msi_common *msis) |
| { |
| int ret; |
| size_t i; |
| int *eventfds; |
| size_t irq_set_size; |
| struct vfio_pci_msi_entry *entry; |
| size_t nr_entries = msis->nr_entries; |
| |
| ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &msis->info); |
| if (ret || msis->info.count == 0) { |
| vfio_dev_err(vdev, "no MSI reported by VFIO"); |
| return -ENODEV; |
| } |
| |
| if (!(msis->info.flags & VFIO_IRQ_INFO_EVENTFD)) { |
| vfio_dev_err(vdev, "interrupt not EVENTFD capable"); |
| return -EINVAL; |
| } |
| |
| if (msis->info.count != nr_entries) { |
| vfio_dev_err(vdev, "invalid number of MSIs reported by VFIO"); |
| return -EINVAL; |
| } |
| |
| mutex_init(&msis->mutex); |
| |
| vfio_pci_reserve_irq_fds(nr_entries); |
| |
| irq_set_size = sizeof(struct vfio_irq_set) + nr_entries * sizeof(int); |
| msis->irq_set = malloc(irq_set_size); |
| if (!msis->irq_set) |
| return -ENOMEM; |
| |
| *msis->irq_set = (struct vfio_irq_set) { |
| .argsz = irq_set_size, |
| .flags = VFIO_IRQ_SET_DATA_EVENTFD | |
| VFIO_IRQ_SET_ACTION_TRIGGER, |
| .index = msis->info.index, |
| .start = 0, |
| .count = nr_entries, |
| }; |
| |
| eventfds = (void *)msis->irq_set + sizeof(struct vfio_irq_set); |
| |
| for (i = 0; i < nr_entries; i++) { |
| entry = &msis->entries[i]; |
| entry->gsi = -1; |
| entry->eventfd = -1; |
| msi_set_masked(entry->guest_state, false); |
| msi_set_masked(entry->host_state, true); |
| eventfds[i] = -1; |
| } |
| |
| return 0; |
| } |
| |
| static void vfio_pci_disable_intx(struct kvm *kvm, struct vfio_device *vdev) |
| { |
| struct vfio_pci_device *pdev = &vdev->pci; |
| int gsi = pdev->intx_gsi; |
| struct vfio_irq_set irq_set = { |
| .argsz = sizeof(irq_set), |
| .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, |
| .index = VFIO_PCI_INTX_IRQ_INDEX, |
| }; |
| |
| if (pdev->intx_fd == -1) |
| return; |
| |
| pr_debug("user requested MSI, disabling INTx %d", gsi); |
| |
| ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); |
| irq__del_irqfd(kvm, gsi, pdev->intx_fd); |
| |
| close(pdev->intx_fd); |
| close(pdev->unmask_fd); |
| pdev->intx_fd = -1; |
| } |
| |
| static int vfio_pci_enable_intx(struct kvm *kvm, struct vfio_device *vdev) |
| { |
| int ret; |
| int trigger_fd, unmask_fd; |
| union vfio_irq_eventfd trigger; |
| union vfio_irq_eventfd unmask; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| int gsi = pdev->intx_gsi; |
| |
| if (pdev->intx_fd != -1) |
| return 0; |
| |
| /* |
| * PCI IRQ is level-triggered, so we use two eventfds. trigger_fd |
| * signals an interrupt from host to guest, and unmask_fd signals the |
| * deassertion of the line from guest to host. |
| */ |
| trigger_fd = eventfd(0, 0); |
| if (trigger_fd < 0) { |
| vfio_dev_err(vdev, "failed to create trigger eventfd"); |
| return trigger_fd; |
| } |
| |
| unmask_fd = eventfd(0, 0); |
| if (unmask_fd < 0) { |
| vfio_dev_err(vdev, "failed to create unmask eventfd"); |
| close(trigger_fd); |
| return unmask_fd; |
| } |
| |
| ret = irq__add_irqfd(kvm, gsi, trigger_fd, unmask_fd); |
| if (ret) |
| goto err_close; |
| |
| trigger.irq = (struct vfio_irq_set) { |
| .argsz = sizeof(trigger), |
| .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, |
| .index = VFIO_PCI_INTX_IRQ_INDEX, |
| .start = 0, |
| .count = 1, |
| }; |
| set_vfio_irq_eventd_payload(&trigger, trigger_fd); |
| |
| ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger); |
| if (ret < 0) { |
| vfio_dev_err(vdev, "failed to setup VFIO IRQ"); |
| goto err_delete_line; |
| } |
| |
| unmask.irq = (struct vfio_irq_set) { |
| .argsz = sizeof(unmask), |
| .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK, |
| .index = VFIO_PCI_INTX_IRQ_INDEX, |
| .start = 0, |
| .count = 1, |
| }; |
| set_vfio_irq_eventd_payload(&unmask, unmask_fd); |
| |
| ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &unmask); |
| if (ret < 0) { |
| vfio_dev_err(vdev, "failed to setup unmask IRQ"); |
| goto err_remove_event; |
| } |
| |
| pdev->intx_fd = trigger_fd; |
| pdev->unmask_fd = unmask_fd; |
| |
| return 0; |
| |
| err_remove_event: |
| /* Remove trigger event */ |
| trigger.irq.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; |
| trigger.irq.count = 0; |
| ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &trigger); |
| |
| err_delete_line: |
| irq__del_irqfd(kvm, gsi, trigger_fd); |
| |
| err_close: |
| close(trigger_fd); |
| close(unmask_fd); |
| return ret; |
| } |
| |
| static int vfio_pci_init_intx(struct kvm *kvm, struct vfio_device *vdev) |
| { |
| int ret; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| struct vfio_irq_info irq_info = { |
| .argsz = sizeof(irq_info), |
| .index = VFIO_PCI_INTX_IRQ_INDEX, |
| }; |
| |
| vfio_pci_reserve_irq_fds(2); |
| |
| ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); |
| if (ret || irq_info.count == 0) { |
| vfio_dev_err(vdev, "no INTx reported by VFIO"); |
| return -ENODEV; |
| } |
| |
| if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { |
| vfio_dev_err(vdev, "interrupt not eventfd capable"); |
| return -EINVAL; |
| } |
| |
| if (!(irq_info.flags & VFIO_IRQ_INFO_AUTOMASKED)) { |
| vfio_dev_err(vdev, "INTx interrupt not AUTOMASKED"); |
| return -EINVAL; |
| } |
| |
| /* Guest is going to ovewrite our irq_line... */ |
| pdev->intx_gsi = pdev->hdr.irq_line - KVM_IRQ_OFFSET; |
| |
| pdev->intx_fd = -1; |
| |
| return 0; |
| } |
| |
| static int vfio_pci_configure_dev_irqs(struct kvm *kvm, struct vfio_device *vdev) |
| { |
| int ret = 0; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSIX) { |
| pdev->msix.info = (struct vfio_irq_info) { |
| .argsz = sizeof(pdev->msix.info), |
| .index = VFIO_PCI_MSIX_IRQ_INDEX, |
| }; |
| ret = vfio_pci_init_msis(kvm, vdev, &pdev->msix); |
| if (ret) |
| return ret; |
| } |
| |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_MSI) { |
| pdev->msi.info = (struct vfio_irq_info) { |
| .argsz = sizeof(pdev->msi.info), |
| .index = VFIO_PCI_MSI_IRQ_INDEX, |
| }; |
| ret = vfio_pci_init_msis(kvm, vdev, &pdev->msi); |
| if (ret) |
| return ret; |
| } |
| |
| if (pdev->irq_modes & VFIO_PCI_IRQ_MODE_INTX) { |
| pci__assign_irq(&vdev->pci.hdr); |
| |
| ret = vfio_pci_init_intx(kvm, vdev); |
| if (ret) |
| return ret; |
| |
| ret = vfio_pci_enable_intx(kvm, vdev); |
| } |
| |
| return ret; |
| } |
| |
| int vfio_pci_setup_device(struct kvm *kvm, struct vfio_device *vdev) |
| { |
| int ret; |
| |
| ret = vfio_pci_configure_dev_regions(kvm, vdev); |
| if (ret) { |
| vfio_dev_err(vdev, "failed to configure regions"); |
| return ret; |
| } |
| |
| vdev->dev_hdr = (struct device_header) { |
| .bus_type = DEVICE_BUS_PCI, |
| .data = &vdev->pci.hdr, |
| }; |
| |
| ret = device__register(&vdev->dev_hdr); |
| if (ret) { |
| vfio_dev_err(vdev, "failed to register VFIO device"); |
| return ret; |
| } |
| |
| ret = vfio_pci_configure_dev_irqs(kvm, vdev); |
| if (ret) { |
| vfio_dev_err(vdev, "failed to configure IRQs"); |
| return ret; |
| } |
| |
| return 0; |
| } |
| |
| void vfio_pci_teardown_device(struct kvm *kvm, struct vfio_device *vdev) |
| { |
| size_t i; |
| struct vfio_pci_device *pdev = &vdev->pci; |
| |
| for (i = 0; i < vdev->info.num_regions; i++) |
| vfio_unmap_region(kvm, &vdev->regions[i]); |
| |
| device__unregister(&vdev->dev_hdr); |
| |
| free(pdev->msix.irq_set); |
| free(pdev->msix.entries); |
| free(pdev->msi.irq_set); |
| free(pdev->msi.entries); |
| } |