diff --git a/MAINTAINERS b/MAINTAINERS index 83f5a6fd7de3..7e48624f4f9f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14797,7 +14797,7 @@ F: include/linux/mdev.h F: samples/vfio-mdev/ VFIO PLATFORM DRIVER -M: Baptiste Reynal +M: Eric Auger L: kvm@vger.kernel.org S: Maintained F: drivers/vfio/platform/ diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 8a1508a8e481..b423a309a6e0 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -302,6 +302,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) { struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; + struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; int i, bar; /* Stop the device from further DMA */ @@ -311,6 +312,15 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) VFIO_IRQ_SET_ACTION_TRIGGER, vdev->irq_type, 0, 0, NULL); + /* Device closed, don't need mutex here */ + list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, + &vdev->ioeventfds_list, next) { + vfio_virqfd_disable(&ioeventfd->virqfd); + list_del(&ioeventfd->next); + kfree(ioeventfd); + } + vdev->ioeventfds_nr = 0; + vdev->virq_disabled = false; for (i = 0; i < vdev->num_regions; i++) @@ -1009,6 +1019,28 @@ hot_reset_release: kfree(groups); return ret; + } else if (cmd == VFIO_DEVICE_IOEVENTFD) { + struct vfio_device_ioeventfd ioeventfd; + int count; + + minsz = offsetofend(struct vfio_device_ioeventfd, fd); + + if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) + return -EFAULT; + + if (ioeventfd.argsz < minsz) + return -EINVAL; + + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) + return -EINVAL; + + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; + + if (hweight8(count) != 1 || ioeventfd.fd < -1) + return -EINVAL; + + return vfio_pci_ioeventfd(vdev, ioeventfd.offset, + ioeventfd.data, count, ioeventfd.fd); } return -ENOTTY; @@ -1171,6 +1203,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); + mutex_init(&vdev->ioeventfds_lock); + INIT_LIST_HEAD(&vdev->ioeventfds_list); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { @@ -1212,6 +1246,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); kfree(vdev->region); + mutex_destroy(&vdev->ioeventfds_lock); kfree(vdev); if (vfio_pci_is_vga(pdev)) { diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index f561ac1c78a0..cde3b5d3441a 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -29,6 +29,19 @@ #define PCI_CAP_ID_INVALID 0xFF /* default raw access */ #define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ +/* Cap maximum number of ioeventfds per device (arbitrary) */ +#define VFIO_PCI_IOEVENTFD_MAX 1000 + +struct vfio_pci_ioeventfd { + struct list_head next; + struct virqfd *virqfd; + void __iomem *addr; + uint64_t data; + loff_t pos; + int bar; + int count; +}; + struct vfio_pci_irq_ctx { struct eventfd_ctx *trigger; struct virqfd *unmask; @@ -92,9 +105,12 @@ struct vfio_pci_device { bool nointx; struct pci_saved_state *pci_saved_state; int refcnt; + int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; struct list_head dummy_resources_list; + struct mutex ioeventfds_lock; + struct list_head ioeventfds_list; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) @@ -120,6 +136,9 @@ extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, + uint64_t data, int count, int fd); + extern int vfio_pci_init_perm_bits(void); extern void vfio_pci_uninit_perm_bits(void); diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 357243d76f10..a6029d0a5524 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -17,10 +17,29 @@ #include #include #include +#include #include #include "vfio_pci_private.h" +#ifdef __LITTLE_ENDIAN +#define vfio_ioread64 ioread64 +#define vfio_iowrite64 iowrite64 +#define vfio_ioread32 ioread32 +#define vfio_iowrite32 iowrite32 +#define vfio_ioread16 ioread16 +#define vfio_iowrite16 iowrite16 +#else +#define vfio_ioread64 ioread64be +#define vfio_iowrite64 iowrite64be +#define vfio_ioread32 ioread32be +#define vfio_iowrite32 iowrite32be +#define vfio_ioread16 ioread16be +#define vfio_iowrite16 iowrite16be +#endif +#define vfio_ioread8 ioread8 +#define vfio_iowrite8 iowrite8 + /* * Read or write from an __iomem region (MMIO or I/O port) with an excluded * range which is inaccessible. The excluded range drops writes and fills @@ -44,15 +63,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, fillable = 0; if (fillable >= 4 && !(off % 4)) { - __le32 val; + u32 val; if (iswrite) { if (copy_from_user(&val, buf, 4)) return -EFAULT; - iowrite32(le32_to_cpu(val), io + off); + vfio_iowrite32(val, io + off); } else { - val = cpu_to_le32(ioread32(io + off)); + val = vfio_ioread32(io + off); if (copy_to_user(buf, &val, 4)) return -EFAULT; @@ -60,15 +79,15 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, filled = 4; } else if (fillable >= 2 && !(off % 2)) { - __le16 val; + u16 val; if (iswrite) { if (copy_from_user(&val, buf, 2)) return -EFAULT; - iowrite16(le16_to_cpu(val), io + off); + vfio_iowrite16(val, io + off); } else { - val = cpu_to_le16(ioread16(io + off)); + val = vfio_ioread16(io + off); if (copy_to_user(buf, &val, 2)) return -EFAULT; @@ -82,9 +101,9 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, if (copy_from_user(&val, buf, 1)) return -EFAULT; - iowrite8(val, io + off); + vfio_iowrite8(val, io + off); } else { - val = ioread8(io + off); + val = vfio_ioread8(io + off); if (copy_to_user(buf, &val, 1)) return -EFAULT; @@ -113,6 +132,30 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf, return done; } +static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + void __iomem *io; + + if (vdev->barmap[bar]) + return 0; + + ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); + if (ret) + return ret; + + io = pci_iomap(pdev, bar, 0); + if (!io) { + pci_release_selected_regions(pdev, 1 << bar); + return -ENOMEM; + } + + vdev->barmap[bar] = io; + + return 0; +} + ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -147,22 +190,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, if (!io) return -ENOMEM; x_end = end; - } else if (!vdev->barmap[bar]) { - int ret; - - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); + } else { + int ret = vfio_pci_setup_barmap(vdev, bar); if (ret) return ret; - io = pci_iomap(pdev, bar, 0); - if (!io) { - pci_release_selected_regions(pdev, 1 << bar); - return -ENOMEM; - } - - vdev->barmap[bar] = io; - } else io = vdev->barmap[bar]; + } if (bar == vdev->msix_bar) { x_start = vdev->msix_offset; @@ -242,3 +276,113 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, return done; } + +static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) +{ + struct vfio_pci_ioeventfd *ioeventfd = opaque; + + switch (ioeventfd->count) { + case 1: + vfio_iowrite8(ioeventfd->data, ioeventfd->addr); + break; + case 2: + vfio_iowrite16(ioeventfd->data, ioeventfd->addr); + break; + case 4: + vfio_iowrite32(ioeventfd->data, ioeventfd->addr); + break; +#ifdef iowrite64 + case 8: + vfio_iowrite64(ioeventfd->data, ioeventfd->addr); + break; +#endif + } + + return 0; +} + +long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, + uint64_t data, int count, int fd) +{ + struct pci_dev *pdev = vdev->pdev; + loff_t pos = offset & VFIO_PCI_OFFSET_MASK; + int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset); + struct vfio_pci_ioeventfd *ioeventfd; + + /* Only support ioeventfds into BARs */ + if (bar > VFIO_PCI_BAR5_REGION_INDEX) + return -EINVAL; + + if (pos + count > pci_resource_len(pdev, bar)) + return -EINVAL; + + /* Disallow ioeventfds working around MSI-X table writes */ + if (bar == vdev->msix_bar && + !(pos + count <= vdev->msix_offset || + pos >= vdev->msix_offset + vdev->msix_size)) + return -EINVAL; + +#ifndef iowrite64 + if (count == 8) + return -EINVAL; +#endif + + ret = vfio_pci_setup_barmap(vdev, bar); + if (ret) + return ret; + + mutex_lock(&vdev->ioeventfds_lock); + + list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) { + if (ioeventfd->pos == pos && ioeventfd->bar == bar && + ioeventfd->data == data && ioeventfd->count == count) { + if (fd == -1) { + vfio_virqfd_disable(&ioeventfd->virqfd); + list_del(&ioeventfd->next); + vdev->ioeventfds_nr--; + kfree(ioeventfd); + ret = 0; + } else + ret = -EEXIST; + + goto out_unlock; + } + } + + if (fd < 0) { + ret = -ENODEV; + goto out_unlock; + } + + if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) { + ret = -ENOSPC; + goto out_unlock; + } + + ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL); + if (!ioeventfd) { + ret = -ENOMEM; + goto out_unlock; + } + + ioeventfd->addr = vdev->barmap[bar] + pos; + ioeventfd->data = data; + ioeventfd->pos = pos; + ioeventfd->bar = bar; + ioeventfd->count = count; + + ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler, + NULL, NULL, &ioeventfd->virqfd, fd); + if (ret) { + kfree(ioeventfd); + goto out_unlock; + } + + list_add(&ioeventfd->next, &vdev->ioeventfds_list); + vdev->ioeventfds_nr++; + +out_unlock: + mutex_unlock(&vdev->ioeventfds_lock); + + return ret; +} diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 45657e2b1ff7..5c212bf29640 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -102,6 +102,13 @@ struct vfio_pfn { atomic_t ref_count; }; +struct vfio_regions { + struct list_head list; + dma_addr_t iova; + phys_addr_t phys; + size_t len; +}; + #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ (!list_empty(&iommu->domain_list)) @@ -397,7 +404,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, { unsigned long pfn = 0; long ret, pinned = 0, lock_acct = 0; - bool rsvd; dma_addr_t iova = vaddr - dma->vaddr + dma->iova; /* This code path is only user initiated */ @@ -408,14 +414,23 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, if (ret) return ret; + if (is_invalid_reserved_pfn(*pfn_base)) { + struct vm_area_struct *vma; + + down_read(¤t->mm->mmap_sem); + vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); + pinned = min_t(long, npage, vma_pages(vma)); + up_read(¤t->mm->mmap_sem); + return pinned; + } + pinned++; - rsvd = is_invalid_reserved_pfn(*pfn_base); /* * Reserved pages aren't counted against the user, externally pinned * pages are already counted against the user. */ - if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!vfio_find_vpfn(dma, iova)) { if (!lock_cap && current->mm->locked_vm + 1 > limit) { put_pfn(*pfn_base, dma->prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, @@ -435,13 +450,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, if (ret) break; - if (pfn != *pfn_base + pinned || - rsvd != is_invalid_reserved_pfn(pfn)) { + if (pfn != *pfn_base + pinned) { put_pfn(pfn, dma->prot); break; } - if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!vfio_find_vpfn(dma, iova)) { if (!lock_cap && current->mm->locked_vm + lock_acct + 1 > limit) { put_pfn(pfn, dma->prot); @@ -459,10 +473,8 @@ out: unpin_out: if (ret) { - if (!rsvd) { - for (pfn = *pfn_base ; pinned ; pfn++, pinned--) - put_pfn(pfn, dma->prot); - } + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) + put_pfn(pfn, dma->prot); return ret; } @@ -660,11 +672,102 @@ unpin_exit: return i > npage ? npage : (i > 0 ? i : -EINVAL); } +static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, + struct list_head *regions) +{ + long unlocked = 0; + struct vfio_regions *entry, *next; + + iommu_tlb_sync(domain->domain); + + list_for_each_entry_safe(entry, next, regions, list) { + unlocked += vfio_unpin_pages_remote(dma, + entry->iova, + entry->phys >> PAGE_SHIFT, + entry->len >> PAGE_SHIFT, + false); + list_del(&entry->list); + kfree(entry); + } + + cond_resched(); + + return unlocked; +} + +/* + * Generally, VFIO needs to unpin remote pages after each IOTLB flush. + * Therefore, when using IOTLB flush sync interface, VFIO need to keep track + * of these regions (currently using a list). + * + * This value specifies maximum number of regions for each IOTLB flush sync. + */ +#define VFIO_IOMMU_TLB_SYNC_MAX 512 + +static size_t unmap_unpin_fast(struct vfio_domain *domain, + struct vfio_dma *dma, dma_addr_t *iova, + size_t len, phys_addr_t phys, long *unlocked, + struct list_head *unmapped_list, + int *unmapped_cnt) +{ + size_t unmapped = 0; + struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL); + + if (entry) { + unmapped = iommu_unmap_fast(domain->domain, *iova, len); + + if (!unmapped) { + kfree(entry); + } else { + iommu_tlb_range_add(domain->domain, *iova, unmapped); + entry->iova = *iova; + entry->phys = phys; + entry->len = unmapped; + list_add_tail(&entry->list, unmapped_list); + + *iova += unmapped; + (*unmapped_cnt)++; + } + } + + /* + * Sync if the number of fast-unmap regions hits the limit + * or in case of errors. + */ + if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) { + *unlocked += vfio_sync_unpin(dma, domain, + unmapped_list); + *unmapped_cnt = 0; + } + + return unmapped; +} + +static size_t unmap_unpin_slow(struct vfio_domain *domain, + struct vfio_dma *dma, dma_addr_t *iova, + size_t len, phys_addr_t phys, + long *unlocked) +{ + size_t unmapped = iommu_unmap(domain->domain, *iova, len); + + if (unmapped) { + *unlocked += vfio_unpin_pages_remote(dma, *iova, + phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + false); + *iova += unmapped; + cond_resched(); + } + return unmapped; +} + static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, bool do_accounting) { dma_addr_t iova = dma->iova, end = dma->iova + dma->size; struct vfio_domain *domain, *d; + LIST_HEAD(unmapped_region_list); + int unmapped_region_cnt = 0; long unlocked = 0; if (!dma->size) @@ -710,20 +813,26 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, break; } - unmapped = iommu_unmap(domain->domain, iova, len); - if (WARN_ON(!unmapped)) - break; - - unlocked += vfio_unpin_pages_remote(dma, iova, - phys >> PAGE_SHIFT, - unmapped >> PAGE_SHIFT, - false); - iova += unmapped; - - cond_resched(); + /* + * First, try to use fast unmap/unpin. In case of failure, + * switch to slow unmap/unpin path. + */ + unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys, + &unlocked, &unmapped_region_list, + &unmapped_region_cnt); + if (!unmapped) { + unmapped = unmap_unpin_slow(domain, dma, &iova, len, + phys, &unlocked); + if (WARN_ON(!unmapped)) + break; + } } dma->iommu_mapped = false; + + if (unmapped_region_cnt) + unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list); + if (do_accounting) { vfio_lock_acct(dma->task, -unlocked, NULL); return 0; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index c74372163ed2..1aa7b82e8169 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -575,6 +575,33 @@ struct vfio_device_gfx_plane_info { #define VFIO_DEVICE_GET_GFX_DMABUF _IO(VFIO_TYPE, VFIO_BASE + 15) +/** + * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 16, + * struct vfio_device_ioeventfd) + * + * Perform a write to the device at the specified device fd offset, with + * the specified data and width when the provided eventfd is triggered. + * vfio bus drivers may not support this for all regions, for all widths, + * or at all. vfio-pci currently only enables support for BAR regions, + * excluding the MSI-X vector table. + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_ioeventfd { + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_IOEVENTFD_8 (1 << 0) /* 1-byte write */ +#define VFIO_DEVICE_IOEVENTFD_16 (1 << 1) /* 2-byte write */ +#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ +#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ +#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) + __u64 offset; /* device fd offset of write */ + __u64 data; /* data to be written */ + __s32 fd; /* -1 for de-assignment */ +}; + +#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) + /* -------- API for Type1 VFIO IOMMU -------- */ /** diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 09f255bdf3ac..7abb79d8313d 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -534,7 +534,7 @@ static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state, /* Interrupt priority 2: Fifo trigger level reached */ if ((ier & UART_IER_RDI) && - (mdev_state->s[index].rxtx.count == + (mdev_state->s[index].rxtx.count >= mdev_state->s[index].intr_trigger_level)) *buf |= UART_IIR_RDI;