diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 02912f180c6d..24ee2605b9f0 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -26,3 +26,7 @@ config VFIO_PCI_MMAP config VFIO_PCI_INTX depends on VFIO_PCI def_bool y if !S390 + +config VFIO_PCI_IGD + depends on VFIO_PCI + def_bool y if X86 diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 131079255fd9..76d8ec058edd 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,4 +1,5 @@ vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 8c80a48e3233..712a84978e97 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -111,6 +111,7 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) } static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); +static void vfio_pci_disable(struct vfio_pci_device *vdev); static int vfio_pci_enable(struct vfio_pci_device *vdev) { @@ -169,13 +170,26 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; + + if (vfio_pci_is_vga(pdev) && + pdev->vendor == PCI_VENDOR_ID_INTEL && + IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { + ret = vfio_pci_igd_init(vdev); + if (ret) { + dev_warn(&vdev->pdev->dev, + "Failed to setup Intel IGD regions\n"); + vfio_pci_disable(vdev); + return ret; + } + } + return 0; } static void vfio_pci_disable(struct vfio_pci_device *vdev) { struct pci_dev *pdev = vdev->pdev; - int bar; + int i, bar; /* Stop the device from further DMA */ pci_clear_master(pdev); @@ -186,6 +200,13 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) vdev->virq_disabled = false; + for (i = 0; i < vdev->num_regions; i++) + vdev->region[i].ops->release(vdev, &vdev->region[i]); + + vdev->num_regions = 0; + kfree(vdev->region); + vdev->region = NULL; /* don't krealloc a freed pointer */ + vfio_config_free(vdev); for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { @@ -421,6 +442,93 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, return walk.ret; } +static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_sparse_mmap *sparse; + size_t end, size; + int nr_areas = 2, i = 0; + + end = pci_resource_len(vdev->pdev, vdev->msix_bar); + + /* If MSI-X table is aligned to the start or end, only one area */ + if (((vdev->msix_offset & PAGE_MASK) == 0) || + (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end)) + nr_areas = 1; + + size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas)); + + header = vfio_info_cap_add(caps, size, + VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + sparse = container_of(header, + struct vfio_region_info_cap_sparse_mmap, header); + sparse->nr_areas = nr_areas; + + if (vdev->msix_offset & PAGE_MASK) { + sparse->areas[i].offset = 0; + sparse->areas[i].size = vdev->msix_offset & PAGE_MASK; + i++; + } + + if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) { + sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset + + vdev->msix_size); + sparse->areas[i].size = end - sparse->areas[i].offset; + i++; + } + + return 0; +} + +static int region_type_cap(struct vfio_pci_device *vdev, + struct vfio_info_cap *caps, + unsigned int type, unsigned int subtype) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_type *cap; + + header = vfio_info_cap_add(caps, sizeof(*cap), + VFIO_REGION_INFO_CAP_TYPE, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + cap = container_of(header, struct vfio_region_info_cap_type, header); + cap->type = type; + cap->subtype = subtype; + + return 0; +} + +int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data) +{ + struct vfio_pci_region *region; + + region = krealloc(vdev->region, + (vdev->num_regions + 1) * sizeof(*region), + GFP_KERNEL); + if (!region) + return -ENOMEM; + + vdev->region = region; + vdev->region[vdev->num_regions].type = type; + vdev->region[vdev->num_regions].subtype = subtype; + vdev->region[vdev->num_regions].ops = ops; + vdev->region[vdev->num_regions].size = size; + vdev->region[vdev->num_regions].flags = flags; + vdev->region[vdev->num_regions].data = data; + + vdev->num_regions++; + + return 0; +} + static long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { @@ -443,7 +551,7 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; - info.num_regions = VFIO_PCI_NUM_REGIONS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; return copy_to_user((void __user *)arg, &info, minsz) ? @@ -452,6 +560,8 @@ static long vfio_pci_ioctl(void *device_data, } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + int i, ret; minsz = offsetofend(struct vfio_region_info, offset); @@ -480,8 +590,15 @@ static long vfio_pci_ioctl(void *device_data, VFIO_REGION_INFO_FLAG_WRITE; if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && pci_resource_flags(pdev, info.index) & - IORESOURCE_MEM && info.size >= PAGE_SIZE) + IORESOURCE_MEM && info.size >= PAGE_SIZE) { info.flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info.index == vdev->msix_bar) { + ret = msix_sparse_mmap_cap(vdev, &caps); + if (ret) + return ret; + } + } + break; case VFIO_PCI_ROM_REGION_INDEX: { @@ -493,8 +610,14 @@ static long vfio_pci_ioctl(void *device_data, /* Report the BAR size, not the ROM size */ info.size = pci_resource_len(pdev, info.index); - if (!info.size) - break; + if (!info.size) { + /* Shadow ROMs appear as PCI option ROMs */ + if (pdev->resource[PCI_ROM_RESOURCE].flags & + IORESOURCE_ROM_SHADOW) + info.size = 0x20000; + else + break; + } /* Is it really there? */ io = pci_map_rom(pdev, &size); @@ -518,7 +641,40 @@ static long vfio_pci_ioctl(void *device_data, break; default: - return -EINVAL; + if (info.index >= + VFIO_PCI_NUM_REGIONS + vdev->num_regions) + return -EINVAL; + + i = info.index - VFIO_PCI_NUM_REGIONS; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vdev->region[i].size; + info.flags = vdev->region[i].flags; + + ret = region_type_cap(vdev, &caps, + vdev->region[i].type, + vdev->region[i].subtype); + if (ret) + return ret; + } + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); } return copy_to_user((void __user *)arg, &info, minsz) ? @@ -798,7 +954,7 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); struct vfio_pci_device *vdev = device_data; - if (index >= VFIO_PCI_NUM_REGIONS) + if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; switch (index) { @@ -815,6 +971,10 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, case VFIO_PCI_VGA_REGION_INDEX: return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + default: + index -= VFIO_PCI_NUM_REGIONS; + return vdev->region[index].ops->rw(vdev, buf, + count, ppos, iswrite); } return -EINVAL; @@ -997,6 +1157,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) return; vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); + kfree(vdev->region); kfree(vdev); if (vfio_pci_is_vga(pdev)) { diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index fe2b470d7ec6..142c533efec7 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -33,9 +33,8 @@ #define PCI_CFG_SPACE_SIZE 256 -/* Useful "pseudo" capabilities */ +/* Fake capability ID for standard config space */ #define PCI_CAP_ID_BASIC 0 -#define PCI_CAP_ID_INVALID 0xFF #define is_bar(offset) \ ((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \ @@ -301,6 +300,23 @@ static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, return count; } +/* Virt access uses only virtualization */ +static int vfio_virt_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + memcpy(vdev->vconfig + pos, &val, count); + return count; +} + +static int vfio_virt_config_read(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) +{ + memcpy(val, vdev->vconfig + pos, count); + return count; +} + /* Default capability regions to read-only, no-virtualization */ static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } @@ -319,6 +335,11 @@ static struct perm_bits unassigned_perms = { .writefn = vfio_raw_config_write }; +static struct perm_bits virt_perms = { + .readfn = vfio_virt_config_read, + .writefn = vfio_virt_config_write +}; + static void free_perm_bits(struct perm_bits *perm) { kfree(perm->virt); @@ -454,14 +475,19 @@ static void vfio_bar_fixup(struct vfio_pci_device *vdev) bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; /* - * NB. we expose the actual BAR size here, regardless of whether - * we can read it. When we report the REGION_INFO for the ROM - * we report what PCI tells us is the actual ROM size. + * NB. REGION_INFO will have reported zero size if we weren't able + * to read the ROM, but we still return the actual BAR size here if + * it exists (or the shadow ROM space). */ if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; *bar &= cpu_to_le32((u32)mask); + } else if (pdev->resource[PCI_ROM_RESOURCE].flags & + IORESOURCE_ROM_SHADOW) { + mask = ~(0x20000 - 1); + mask |= PCI_ROM_ADDRESS_ENABLE; + *bar &= cpu_to_le32((u32)mask); } else *bar = 0; @@ -1332,6 +1358,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) pos + i, map[pos + i], cap); } + BUILD_BUG_ON(PCI_CAP_ID_MAX >= PCI_CAP_ID_INVALID_VIRT); + memset(map + pos, cap, len); ret = vfio_fill_vconfig_bytes(vdev, pos, len); if (ret) @@ -1419,9 +1447,9 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) /* * Even though ecap is 2 bytes, we're currently a long way * from exceeding 1 byte capabilities. If we ever make it - * up to 0xFF we'll need to up this to a two-byte, byte map. + * up to 0xFE we'll need to up this to a two-byte, byte map. */ - BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); + BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID_VIRT); memset(map + epos, ecap, len); ret = vfio_fill_vconfig_bytes(vdev, epos, len); @@ -1597,6 +1625,9 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, if (cap_id == PCI_CAP_ID_INVALID) { perm = &unassigned_perms; cap_start = *ppos; + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { + perm = &virt_perms; + cap_start = *ppos; } else { if (*ppos >= PCI_CFG_SPACE_SIZE) { WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c new file mode 100644 index 000000000000..6394b168ef29 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -0,0 +1,280 @@ +/* + * VFIO PCI Intel Graphics support + * + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Register a device specific region through which to provide read-only + * access to the Intel IGD opregion. The register defining the opregion + * address is also virtualized to prevent user modification. + */ + +#include +#include +#include +#include + +#include "vfio_pci_private.h" + +#define OPREGION_SIGNATURE "IntelGraphicsMem" +#define OPREGION_SIZE (8 * 1024) +#define OPREGION_PCI_ADDR 0xfc + +static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + void *base = vdev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (pos >= vdev->region[i].size || iswrite) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[i].size - pos)); + + if (copy_to_user(buf, base + pos, count)) + return -EFAULT; + + *ppos += count; + + return count; +} + +static void vfio_pci_igd_release(struct vfio_pci_device *vdev, + struct vfio_pci_region *region) +{ + memunmap(region->data); +} + +static const struct vfio_pci_regops vfio_pci_igd_regops = { + .rw = vfio_pci_igd_rw, + .release = vfio_pci_igd_release, +}; + +static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) +{ + __le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR); + u32 addr, size; + void *base; + int ret; + + ret = pci_read_config_dword(vdev->pdev, OPREGION_PCI_ADDR, &addr); + if (ret) + return ret; + + if (!addr || !(~addr)) + return -ENODEV; + + base = memremap(addr, OPREGION_SIZE, MEMREMAP_WB); + if (!base) + return -ENOMEM; + + if (memcmp(base, OPREGION_SIGNATURE, 16)) { + memunmap(base); + return -EINVAL; + } + + size = le32_to_cpu(*(__le32 *)(base + 16)); + if (!size) { + memunmap(base); + return -EINVAL; + } + + size *= 1024; /* In KB */ + + if (size != OPREGION_SIZE) { + memunmap(base); + base = memremap(addr, size, MEMREMAP_WB); + if (!base) + return -ENOMEM; + } + + ret = vfio_pci_register_dev_region(vdev, + PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, + &vfio_pci_igd_regops, size, VFIO_REGION_INFO_FLAG_READ, base); + if (ret) { + memunmap(base); + return ret; + } + + /* Fill vconfig with the hw value and virtualize register */ + *dwordp = cpu_to_le32(addr); + memset(vdev->pci_config_map + OPREGION_PCI_ADDR, + PCI_CAP_ID_INVALID_VIRT, 4); + + return ret; +} + +static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct pci_dev *pdev = vdev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + size_t size; + int ret; + + if (pos >= vdev->region[i].size || iswrite) + return -EINVAL; + + size = count = min(count, (size_t)(vdev->region[i].size - pos)); + + if ((pos & 1) && size) { + u8 val; + + ret = pci_user_read_config_byte(pdev, pos, &val); + if (ret) + return pcibios_err_to_errno(ret); + + if (copy_to_user(buf + count - size, &val, 1)) + return -EFAULT; + + pos++; + size--; + } + + if ((pos & 3) && size > 2) { + u16 val; + + ret = pci_user_read_config_word(pdev, pos, &val); + if (ret) + return pcibios_err_to_errno(ret); + + val = cpu_to_le16(val); + if (copy_to_user(buf + count - size, &val, 2)) + return -EFAULT; + + pos += 2; + size -= 2; + } + + while (size > 3) { + u32 val; + + ret = pci_user_read_config_dword(pdev, pos, &val); + if (ret) + return pcibios_err_to_errno(ret); + + val = cpu_to_le32(val); + if (copy_to_user(buf + count - size, &val, 4)) + return -EFAULT; + + pos += 4; + size -= 4; + } + + while (size >= 2) { + u16 val; + + ret = pci_user_read_config_word(pdev, pos, &val); + if (ret) + return pcibios_err_to_errno(ret); + + val = cpu_to_le16(val); + if (copy_to_user(buf + count - size, &val, 2)) + return -EFAULT; + + pos += 2; + size -= 2; + } + + while (size) { + u8 val; + + ret = pci_user_read_config_byte(pdev, pos, &val); + if (ret) + return pcibios_err_to_errno(ret); + + if (copy_to_user(buf + count - size, &val, 1)) + return -EFAULT; + + pos++; + size--; + } + + *ppos += count; + + return count; +} + +static void vfio_pci_igd_cfg_release(struct vfio_pci_device *vdev, + struct vfio_pci_region *region) +{ + struct pci_dev *pdev = region->data; + + pci_dev_put(pdev); +} + +static const struct vfio_pci_regops vfio_pci_igd_cfg_regops = { + .rw = vfio_pci_igd_cfg_rw, + .release = vfio_pci_igd_cfg_release, +}; + +static int vfio_pci_igd_cfg_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *host_bridge, *lpc_bridge; + int ret; + + host_bridge = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0)); + if (!host_bridge) + return -ENODEV; + + if (host_bridge->vendor != PCI_VENDOR_ID_INTEL || + host_bridge->class != (PCI_CLASS_BRIDGE_HOST << 8)) { + pci_dev_put(host_bridge); + return -EINVAL; + } + + ret = vfio_pci_register_dev_region(vdev, + PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, + &vfio_pci_igd_cfg_regops, host_bridge->cfg_size, + VFIO_REGION_INFO_FLAG_READ, host_bridge); + if (ret) { + pci_dev_put(host_bridge); + return ret; + } + + lpc_bridge = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0x1f, 0)); + if (!lpc_bridge) + return -ENODEV; + + if (lpc_bridge->vendor != PCI_VENDOR_ID_INTEL || + lpc_bridge->class != (PCI_CLASS_BRIDGE_ISA << 8)) { + pci_dev_put(lpc_bridge); + return -EINVAL; + } + + ret = vfio_pci_register_dev_region(vdev, + PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, + &vfio_pci_igd_cfg_regops, lpc_bridge->cfg_size, + VFIO_REGION_INFO_FLAG_READ, lpc_bridge); + if (ret) { + pci_dev_put(lpc_bridge); + return ret; + } + + return 0; +} + +int vfio_pci_igd_init(struct vfio_pci_device *vdev) +{ + int ret; + + ret = vfio_pci_igd_opregion_init(vdev); + if (ret) + return ret; + + ret = vfio_pci_igd_cfg_init(vdev); + if (ret) + return ret; + + return 0; +} diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 3b3ba15558b7..e9ea3fef144a 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -309,14 +309,14 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, int vector, int fd, bool msix) { struct pci_dev *pdev = vdev->pdev; - int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector; - char *name = msix ? "vfio-msix" : "vfio-msi"; struct eventfd_ctx *trigger; - int ret; + int irq, ret; - if (vector >= vdev->num_ctx) + if (vector < 0 || vector >= vdev->num_ctx) return -EINVAL; + irq = msix ? vdev->msix[vector].vector : pdev->irq + vector; + if (vdev->ctx[vector].trigger) { free_irq(irq, vdev->ctx[vector].trigger); irq_bypass_unregister_producer(&vdev->ctx[vector].producer); @@ -328,8 +328,9 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, if (fd < 0) return 0; - vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)", - name, vector, pci_name(pdev)); + vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "vfio-msi%s[%d](%s)", + msix ? "x" : "", vector, + pci_name(pdev)); if (!vdev->ctx[vector].name) return -ENOMEM; @@ -379,7 +380,7 @@ static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, { int i, j, ret = 0; - if (start + count > vdev->num_ctx) + if (start >= vdev->num_ctx || start + count > vdev->num_ctx) return -EINVAL; for (i = 0, j = start; i < count && !ret; i++, j++) { @@ -388,7 +389,7 @@ static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, } if (ret) { - for (--j; j >= start; j--) + for (--j; j >= (int)start; j--) vfio_msi_set_vector_signal(vdev, j, -1, msix); } diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 0e7394f8f69b..8a7d546d18a0 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H @@ -24,6 +25,10 @@ #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) +/* Special capability IDs predefined access */ +#define PCI_CAP_ID_INVALID 0xFF /* default raw access */ +#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ + struct vfio_pci_irq_ctx { struct eventfd_ctx *trigger; struct virqfd *unmask; @@ -33,6 +38,25 @@ struct vfio_pci_irq_ctx { struct irq_bypass_producer producer; }; +struct vfio_pci_device; +struct vfio_pci_region; + +struct vfio_pci_regops { + size_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + void (*release)(struct vfio_pci_device *vdev, + struct vfio_pci_region *region); +}; + +struct vfio_pci_region { + u32 type; + u32 subtype; + const struct vfio_pci_regops *ops; + void *data; + size_t size; + u32 flags; +}; + struct vfio_pci_device { struct pci_dev *pdev; void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; @@ -45,6 +69,8 @@ struct vfio_pci_device { struct vfio_pci_irq_ctx *ctx; int num_ctx; int irq_type; + int num_regions; + struct vfio_pci_region *region; u8 msi_qmax; u8 msix_bar; u16 msix_size; @@ -91,4 +117,17 @@ extern void vfio_pci_uninit_perm_bits(void); extern int vfio_config_init(struct vfio_pci_device *vdev); extern void vfio_config_free(struct vfio_pci_device *vdev); + +extern int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data); +#ifdef CONFIG_VFIO_PCI_IGD +extern int vfio_pci_igd_init(struct vfio_pci_device *vdev); +#else +static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) +{ + return -ENODEV; +} +#endif #endif /* VFIO_PCI_PRIVATE_H */ diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 210db24d2204..5ffd1d9ad4bd 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -124,11 +124,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, void __iomem *io; ssize_t done; - if (!pci_resource_start(pdev, bar)) + if (pci_resource_start(pdev, bar)) + end = pci_resource_len(pdev, bar); + else if (bar == PCI_ROM_RESOURCE && + pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW) + end = 0x20000; + else return -EINVAL; - end = pci_resource_len(pdev, bar); - if (pos >= end) return -EINVAL; diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index ecca316386f5..6fd6fa5469de 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1080,30 +1080,26 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, continue; } - /* module reference holds the driver we're working on */ - mutex_unlock(&vfio.iommu_drivers_lock); - data = driver->ops->open(arg); if (IS_ERR(data)) { ret = PTR_ERR(data); module_put(driver->ops->owner); - goto skip_drivers_unlock; + continue; } ret = __vfio_container_attach_groups(container, driver, data); - if (!ret) { - container->iommu_driver = driver; - container->iommu_data = data; - } else { + if (ret) { driver->ops->release(data); module_put(driver->ops->owner); + continue; } - goto skip_drivers_unlock; + container->iommu_driver = driver; + container->iommu_data = data; + break; } mutex_unlock(&vfio.iommu_drivers_lock); -skip_drivers_unlock: up_write(&container->group_lock); return ret; @@ -1732,6 +1728,60 @@ long vfio_external_check_extension(struct vfio_group *group, unsigned long arg) } EXPORT_SYMBOL_GPL(vfio_external_check_extension); +/** + * Sub-module support + */ +/* + * Helper for managing a buffer of info chain capabilities, allocate or + * reallocate a buffer with additional @size, filling in @id and @version + * of the capability. A pointer to the new capability is returned. + * + * NB. The chain is based at the head of the buffer, so new entries are + * added to the tail, vfio_info_cap_shift() should be called to fixup the + * next offsets prior to copying to the user buffer. + */ +struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, + size_t size, u16 id, u16 version) +{ + void *buf; + struct vfio_info_cap_header *header, *tmp; + + buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); + if (!buf) { + kfree(caps->buf); + caps->size = 0; + return ERR_PTR(-ENOMEM); + } + + caps->buf = buf; + header = buf + caps->size; + + /* Eventually copied to user buffer, zero */ + memset(header, 0, size); + + header->id = id; + header->version = version; + + /* Add to the end of the capability chain */ + for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next) + ; /* nothing */ + + tmp->next = caps->size; + caps->size += size; + + return header; +} +EXPORT_SYMBOL_GPL(vfio_info_cap_add); + +void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) +{ + struct vfio_info_cap_header *tmp; + + for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset) + tmp->next += offset; +} +EXPORT_SYMBOL_GPL(vfio_info_cap_shift); + /** * Module/class support */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 610a86a892b8..0ecae0b1cd34 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -92,6 +92,17 @@ extern int vfio_external_user_iommu_id(struct vfio_group *group); extern long vfio_external_check_extension(struct vfio_group *group, unsigned long arg); +/* + * Sub-module helpers + */ +struct vfio_info_cap { + struct vfio_info_cap_header *buf; + size_t size; +}; +extern struct vfio_info_cap_header *vfio_info_cap_add( + struct vfio_info_cap *caps, size_t size, u16 id, u16 version); +extern void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset); + struct pci_dev; #ifdef CONFIG_EEH extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 7d7a4c6f2090..255a2113f53c 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -59,6 +59,33 @@ #define VFIO_TYPE (';') #define VFIO_BASE 100 +/* + * For extension of INFO ioctls, VFIO makes use of a capability chain + * designed after PCI/e capabilities. A flag bit indicates whether + * this capability chain is supported and a field defined in the fixed + * structure defines the offset of the first capability in the chain. + * This field is only valid when the corresponding bit in the flags + * bitmap is set. This offset field is relative to the start of the + * INFO buffer, as is the next field within each capability header. + * The id within the header is a shared address space per INFO ioctl, + * while the version field is specific to the capability id. The + * contents following the header are specific to the capability id. + */ +struct vfio_info_cap_header { + __u16 id; /* Identifies capability */ + __u16 version; /* Version specific to the capability ID */ + __u32 next; /* Offset of next capability */ +}; + +/* + * Callers of INFO ioctls passing insufficiently sized buffers will see + * the capability chain flag bit set, a zero value for the first capability + * offset (if available within the provided argsz), and argsz will be + * updated to report the necessary buffer size. For compatibility, the + * INFO ioctl will not report error in this case, but the capability chain + * will not be available. + */ + /* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */ /** @@ -194,13 +221,73 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */ #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ +#define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ __u32 index; /* Region index */ - __u32 resv; /* Reserved for alignment */ + __u32 cap_offset; /* Offset within info struct of first cap */ __u64 size; /* Region size (bytes) */ __u64 offset; /* Region offset from start of device fd */ }; #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) +/* + * The sparse mmap capability allows finer granularity of specifying areas + * within a region with mmap support. When specified, the user should only + * mmap the offset ranges specified by the areas array. mmaps outside of the + * areas specified may fail (such as the range covering a PCI MSI-X table) or + * may result in improper device behavior. + * + * The structures below define version 1 of this capability. + */ +#define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 + +struct vfio_region_sparse_mmap_area { + __u64 offset; /* Offset of mmap'able area within region */ + __u64 size; /* Size of mmap'able area */ +}; + +struct vfio_region_info_cap_sparse_mmap { + struct vfio_info_cap_header header; + __u32 nr_areas; + __u32 reserved; + struct vfio_region_sparse_mmap_area areas[]; +}; + +/* + * The device specific type capability allows regions unique to a specific + * device or class of devices to be exposed. This helps solve the problem for + * vfio bus drivers of defining which region indexes correspond to which region + * on the device, without needing to resort to static indexes, as done by + * vfio-pci. For instance, if we were to go back in time, we might remove + * VFIO_PCI_VGA_REGION_INDEX and let vfio-pci simply define that all indexes + * greater than or equal to VFIO_PCI_NUM_REGIONS are device specific and we'd + * make a "VGA" device specific type to describe the VGA access space. This + * means that non-VGA devices wouldn't need to waste this index, and thus the + * address space associated with it due to implementation of device file + * descriptor offsets in vfio-pci. + * + * The current implementation is now part of the user ABI, so we can't use this + * for VGA, but there are other upcoming use cases, such as opregions for Intel + * IGD devices and framebuffers for vGPU devices. We missed VGA, but we'll + * use this for future additions. + * + * The structure below defines version 1 of this capability. + */ +#define VFIO_REGION_INFO_CAP_TYPE 2 + +struct vfio_region_info_cap_type { + struct vfio_info_cap_header header; + __u32 type; /* global per bus driver */ + __u32 subtype; /* type specific */ +}; + +#define VFIO_REGION_TYPE_PCI_VENDOR_TYPE (1 << 31) +#define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) + +/* 8086 Vendor sub-types */ +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) + /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, * struct vfio_irq_info) @@ -336,7 +423,8 @@ enum { * between described ranges are unimplemented. */ VFIO_PCI_VGA_REGION_INDEX, - VFIO_PCI_NUM_REGIONS + VFIO_PCI_NUM_REGIONS = 9 /* Fixed user ABI, region indexes >=9 use */ + /* device specific cap to define content. */ }; enum {