// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <uapi/linux/vfio.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

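/*
 * Return the current compatibility IOAS with a reference held, or
 * ERR_PTR(-ENODEV) if no compatibility IOAS has been set. The caller must
 * release the reference with iommufd_put_object().
 */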
static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
{
        struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);

        xa_lock(&ictx->objects);
        if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
                goto out_unlock;
        ioas = ictx->vfio_ioas;
out_unlock:
        xa_unlock(&ictx->objects);
        return ioas;
}

/**
 * iommufd_vfio_compat_ioas_id - Return the IOAS ID that vfio should use
 * @ictx: Context to operate on
 * @out_ioas_id: The ioas_id the caller should use
 *
 * The compatibility IOAS is the IOAS that the vfio compatibility ioctls
 * operate on since they do not have an IOAS ID input in their ABI. Only
 * attaching a group should cause a default creation of the internal IOAS;
 * this returns the existing IOAS if one has already been assigned somehow.
 */
int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
{
        struct iommufd_ioas *ioas = NULL;
        struct iommufd_ioas *out_ioas;

        ioas = iommufd_ioas_alloc(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        xa_lock(&ictx->objects);
        if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
                out_ioas = ictx->vfio_ioas;
        } else {
                out_ioas = ioas;
                ictx->vfio_ioas = ioas;
        }
        xa_unlock(&ictx->objects);

        *out_ioas_id = out_ioas->obj.id;
        if (out_ioas != ioas) {
                iommufd_put_object(&out_ioas->obj);
                iommufd_object_abort(ictx, &ioas->obj);
                return 0;
        }
        /*
         * An automatically created compat IOAS is treated as a userspace
         * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
         * and if not manually destroyed it will be destroyed automatically
         * at iommufd release.
         */
        iommufd_object_finalize(ictx, &ioas->obj);
        return 0;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_id, IOMMUFD_VFIO);

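/*
 * Handler for the IOMMU_VFIO_IOAS ioctl. Lets userspace read, replace, or
 * clear the IOAS that the VFIO compatibility ioctls operate on.
 */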
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
{
        struct iommu_vfio_ioas *cmd = ucmd->cmd;
        struct iommufd_ioas *ioas;

        if (cmd->__reserved)
                return -EOPNOTSUPP;
        switch (cmd->op) {
        case IOMMU_VFIO_IOAS_GET:
                ioas = get_compat_ioas(ucmd->ictx);
                if (IS_ERR(ioas))
                        return PTR_ERR(ioas);
                cmd->ioas_id = ioas->obj.id;
                iommufd_put_object(&ioas->obj);
                return iommufd_ucmd_respond(ucmd, sizeof(*cmd));

        case IOMMU_VFIO_IOAS_SET:
                ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
                if (IS_ERR(ioas))
                        return PTR_ERR(ioas);
                xa_lock(&ucmd->ictx->objects);
                ucmd->ictx->vfio_ioas = ioas;
                xa_unlock(&ucmd->ictx->objects);
                iommufd_put_object(&ioas->obj);
                return 0;

        case IOMMU_VFIO_IOAS_CLEAR:
                xa_lock(&ucmd->ictx->objects);
                ucmd->ictx->vfio_ioas = NULL;
                xa_unlock(&ucmd->ictx->objects);
                return 0;
        default:
                return -EOPNOTSUPP;
        }
}

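/*
 * Emulate VFIO_IOMMU_MAP_DMA on top of the compatibility IOAS: pin the user
 * memory described by the vfio_iommu_type1_dma_map argument and map it at the
 * requested IOVA with the requested protection.
 */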
static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
                                void __user *arg)
{
        u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
        size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
        struct vfio_iommu_type1_dma_map map;
        int iommu_prot = IOMMU_CACHE;
        struct iommufd_ioas *ioas;
        unsigned long iova;
        int rc;

        if (copy_from_user(&map, arg, minsz))
                return -EFAULT;

        if (map.argsz < minsz || map.flags & ~supported_flags)
                return -EINVAL;

        if (map.flags & VFIO_DMA_MAP_FLAG_READ)
                iommu_prot |= IOMMU_READ;
        if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
                iommu_prot |= IOMMU_WRITE;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        /*
         * Maps created through the legacy interface always use VFIO compatible
         * rlimit accounting. If the user wishes to use the faster user based
         * rlimit accounting then they must use the new interface.
         */
        iova = map.iova;
        rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),
                                 map.size, iommu_prot, 0);
        iommufd_put_object(&ioas->obj);
        return rc;
}

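/*
 * Emulate VFIO_IOMMU_UNMAP_DMA. Supports unmapping a single IOVA range or,
 * with VFIO_DMA_UNMAP_FLAG_ALL, everything in the compatibility IOAS. The
 * number of bytes unmapped is reported back through the size field.
 */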
static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
                                  void __user *arg)
{
        size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
        /*
         * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
         * dirty tracking direction:
         * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
         * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
         */
        u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
        struct vfio_iommu_type1_dma_unmap unmap;
        unsigned long unmapped = 0;
        struct iommufd_ioas *ioas;
        int rc;

        if (copy_from_user(&unmap, arg, minsz))
                return -EFAULT;

        if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
                return -EINVAL;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
                if (unmap.iova != 0 || unmap.size != 0) {
                        rc = -EINVAL;
                        goto err_put;
                }
                rc = iopt_unmap_all(&ioas->iopt, &unmapped);
        } else {
                if (READ_ONCE(ioas->iopt.disable_large_pages)) {
                        /*
                         * Create cuts at the start and last of the requested
                         * range. If the start IOVA is 0 then it doesn't need to
                         * be cut.
                         */
                        unsigned long iovas[] = { unmap.iova + unmap.size - 1,
                                                  unmap.iova - 1 };

                        rc = iopt_cut_iova(&ioas->iopt, iovas,
                                           unmap.iova ? 2 : 1);
                        if (rc)
                                goto err_put;
                }
                rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
                                     &unmapped);
        }
        unmap.size = unmapped;
        if (copy_to_user(arg, &unmap, minsz))
                rc = -EFAULT;

err_put:
        iommufd_put_object(&ioas->obj);
        return rc;
}

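/*
 * Report whether every hw_pagetable attached to the compatibility IOAS
 * enforces cache coherency, which is what VFIO_DMA_CC_IOMMU advertises.
 * Returns 1 if all do, 0 if any does not, or a negative errno.
 */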
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
        struct iommufd_hw_pagetable *hwpt;
        struct iommufd_ioas *ioas;
        int rc = 1;

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        mutex_lock(&ioas->mutex);
        list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
                if (!hwpt->enforce_cache_coherency) {
                        rc = 0;
                        break;
                }
        }
        mutex_unlock(&ioas->mutex);

        iommufd_put_object(&ioas->obj);
        return rc;
}

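/*
 * Emulate VFIO_CHECK_EXTENSION for the container-level extensions that the
 * compatibility layer supports.
 */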
static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
                                        unsigned long type)
{
        switch (type) {
        case VFIO_TYPE1_IOMMU:
        case VFIO_TYPE1v2_IOMMU:
        case VFIO_UNMAP_ALL:
                return 1;

        case VFIO_DMA_CC_IOMMU:
                return iommufd_vfio_cc_iommu(ictx);

        /*
         * This is obsolete, and to be removed from VFIO. It was an incomplete
         * idea that got merged.
         * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
         */
        case VFIO_TYPE1_NESTING_IOMMU:
                return 0;

        /*
         * VFIO_DMA_MAP_FLAG_VADDR
         * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
         * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
         *
         * It is hard to see how this could be implemented safely.
         */
        case VFIO_UPDATE_VADDR:
        default:
                return 0;
        }
}

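/*
 * Emulate VFIO_SET_IOMMU. Only the type1 and type1v2 backends are accepted,
 * and both are layered on the same compatibility IOAS.
 */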
static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
{
        struct iommufd_ioas *ioas = NULL;
        int rc = 0;

        if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
                return -EINVAL;

        /* VFIO fails the set_iommu if there is no group */
        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        /*
         * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
         * the middle of mapped ranges. This is complicated by huge page
         * support, which creates single large IOPTEs that cannot be split by
         * the iommu driver. TYPE1 is very old at this point and likely nothing
         * uses it, however it is simple enough to emulate by disabling the
         * problematic large IOPTEs. Then we can safely unmap within any range.
         */
        if (type == VFIO_TYPE1_IOMMU)
                rc = iopt_disable_large_pages(&ioas->iopt);
        iommufd_put_object(&ioas->obj);
        return rc;
}

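/*
 * Compute the supported page size bitmap across all domains currently
 * attached to the IOAS, mirroring what the type1 code reports through
 * VFIO_IOMMU_GET_INFO.
 */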
static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
{
        struct io_pagetable *iopt = &ioas->iopt;
        unsigned long pgsize_bitmap = ULONG_MAX;
        struct iommu_domain *domain;
        unsigned long index;

        down_read(&iopt->domains_rwsem);
        xa_for_each(&iopt->domains, index, domain)
                pgsize_bitmap &= domain->pgsize_bitmap;

        /* See vfio_update_pgsize_bitmap() */
        if (pgsize_bitmap & ~PAGE_MASK) {
                pgsize_bitmap &= PAGE_MASK;
                pgsize_bitmap |= PAGE_SIZE;
        }
        pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
        up_read(&iopt->domains_rwsem);
        return pgsize_bitmap;
}

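/*
 * Fill the VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability with the usable
 * IOVA ranges (the holes between reserved regions). Returns the size the
 * capability needs; data is only copied to userspace when it fits in @avail.
 */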
static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
                                 struct vfio_info_cap_header __user *cur,
                                 size_t avail)
{
        struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
                container_of(cur,
                             struct vfio_iommu_type1_info_cap_iova_range __user,
                             header);
        struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
                .header = {
                        .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
                        .version = 1,
                },
        };
        struct interval_tree_span_iter span;

        interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
                                    ULONG_MAX) {
                struct vfio_iova_range range;

                if (!span.is_hole)
                        continue;
                range.start = span.start_hole;
                range.end = span.last_hole;
                if (avail >= struct_size(&cap_iovas, iova_ranges,
                                         cap_iovas.nr_iovas + 1) &&
                    copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
                                 &range, sizeof(range)))
                        return -EFAULT;
                cap_iovas.nr_iovas++;
        }
        if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
            copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
                return -EFAULT;
        return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
}

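/*
 * Fill the VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL capability. Returns the size the
 * capability needs; it is only copied to userspace when it fits in @avail.
 */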
static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
                                      struct vfio_info_cap_header __user *cur,
                                      size_t avail)
{
        struct vfio_iommu_type1_info_dma_avail cap_dma = {
                .header = {
                        .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
                        .version = 1,
                },
                /*
                 * iommufd's limit is based on the cgroup's memory limit.
                 * Normally vfio would return U16_MAX here, and provide a module
                 * parameter to adjust it. Since S390 qemu userspace actually
                 * pays attention and needs a value bigger than U16_MAX, return
                 * U32_MAX.
                 */
                .avail = U32_MAX,
        };

        if (avail >= sizeof(cap_dma) &&
            copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
                return -EFAULT;
        return sizeof(cap_dma);
}

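/*
 * Emulate VFIO_IOMMU_GET_INFO: report the page size bitmap and chain the
 * capability structures after the fixed header, limited by the user-provided
 * argsz, which is updated to the size needed to return everything.
 */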
static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
                                       void __user *arg)
{
        typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
                                   struct vfio_info_cap_header __user *cur,
                                   size_t avail);
        static const fill_cap_fn fill_fns[] = {
                iommufd_fill_cap_dma_avail,
                iommufd_fill_cap_iova,
        };
        size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
        struct vfio_info_cap_header __user *last_cap = NULL;
        struct vfio_iommu_type1_info info;
        struct iommufd_ioas *ioas;
        size_t total_cap_size;
        int rc;
        int i;

        if (copy_from_user(&info, arg, minsz))
                return -EFAULT;

        if (info.argsz < minsz)
                return -EINVAL;
        minsz = min_t(size_t, info.argsz, sizeof(info));

        ioas = get_compat_ioas(ictx);
        if (IS_ERR(ioas))
                return PTR_ERR(ioas);

        info.flags = VFIO_IOMMU_INFO_PGSIZES;
        info.iova_pgsizes = iommufd_get_pagesizes(ioas);
        info.cap_offset = 0;

        down_read(&ioas->iopt.iova_rwsem);
        total_cap_size = sizeof(info);
        for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
                int cap_size;

                if (info.argsz > total_cap_size)
                        cap_size = fill_fns[i](ioas, arg + total_cap_size,
                                               info.argsz - total_cap_size);
                else
                        cap_size = fill_fns[i](ioas, NULL, 0);
                if (cap_size < 0) {
                        rc = cap_size;
                        goto out_put;
                }
                if (last_cap && info.argsz >= total_cap_size &&
                    put_user(total_cap_size, &last_cap->next)) {
                        rc = -EFAULT;
                        goto out_put;
                }
                last_cap = arg + total_cap_size;
                total_cap_size += cap_size;
        }

        /*
         * If the user did not provide enough space then only some caps are
         * returned and the argsz will be updated to the correct amount to get
         * all caps.
         */
        if (info.argsz >= total_cap_size)
                info.cap_offset = sizeof(info);
        info.argsz = total_cap_size;
        info.flags |= VFIO_IOMMU_INFO_CAPS;
        if (copy_to_user(arg, &info, minsz)) {
                rc = -EFAULT;
                goto out_put;
        }
        rc = 0;

out_put:
        up_read(&ioas->iopt.iova_rwsem);
        iommufd_put_object(&ioas->obj);
        return rc;
}

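/*
 * Entry point for VFIO container ioctls redirected to iommufd. Returns
 * -ENOIOCTLCMD for anything the compatibility layer does not implement.
 */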
int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
                       unsigned long arg)
{
        void __user *uarg = (void __user *)arg;

        switch (cmd) {
        case VFIO_GET_API_VERSION:
                return VFIO_API_VERSION;
        case VFIO_SET_IOMMU:
                return iommufd_vfio_set_iommu(ictx, arg);
        case VFIO_CHECK_EXTENSION:
                return iommufd_vfio_check_extension(ictx, arg);
        case VFIO_IOMMU_GET_INFO:
                return iommufd_vfio_iommu_get_info(ictx, uarg);
        case VFIO_IOMMU_MAP_DMA:
                return iommufd_vfio_map_dma(ictx, cmd, uarg);
        case VFIO_IOMMU_UNMAP_DMA:
                return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
        case VFIO_IOMMU_DIRTY_PAGES:
        default:
                return -ENOIOCTLCMD;
        }
        return -ENOIOCTLCMD;
}