diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index cbe6a890a93a..e1738f666417 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -134,6 +134,16 @@ config INTEL_IOMMU and include PCI device scope covered by these DMA remapping devices. +config INTEL_IOMMU_SVM + bool "Support for Shared Virtual Memory with Intel IOMMU" + depends on INTEL_IOMMU && X86 + select PCI_PASID + select MMU_NOTIFIER + help + Shared Virtual Memory (SVM) provides a facility for devices + to access DMA resources through process address space by + means of a Process Address Space ID (PASID). + config INTEL_IOMMU_DEFAULT_ON def_bool y prompt "Enable Intel DMA Remapping Devices by default" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index c6dcc513d711..dc6f511f45a3 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_ARM_SMMU) += arm-smmu.o obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o +obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o obj-$(CONFIG_IRQ_REMAP) += intel_irq_remapping.o irq_remapping.o obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 8757f8dfc4e5..80e3c176008e 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1086,6 +1086,11 @@ static void free_iommu(struct intel_iommu *iommu) iommu_device_destroy(iommu->iommu_dev); if (iommu->irq) { + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); + dmar_free_hwirq(iommu->pr_irq); + iommu->pr_irq = 0; + } free_irq(iommu->irq, iommu); dmar_free_hwirq(iommu->irq); iommu->irq = 0; @@ -1493,53 +1498,68 @@ static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type) } } + +static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq) +{ + if (iommu->irq == irq) + return DMAR_FECTL_REG; + else if (iommu->pr_irq == irq) + return DMAR_PECTL_REG; + else + BUG(); +} + void dmar_msi_unmask(struct irq_data *data) { struct intel_iommu *iommu = irq_data_get_irq_handler_data(data); + int reg = dmar_msi_reg(iommu, data->irq); unsigned long flag; /* unmask it */ raw_spin_lock_irqsave(&iommu->register_lock, flag); - writel(0, iommu->reg + DMAR_FECTL_REG); + writel(0, iommu->reg + reg); /* Read a reg to force flush the post write */ - readl(iommu->reg + DMAR_FECTL_REG); + readl(iommu->reg + reg); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } void dmar_msi_mask(struct irq_data *data) { - unsigned long flag; struct intel_iommu *iommu = irq_data_get_irq_handler_data(data); + int reg = dmar_msi_reg(iommu, data->irq); + unsigned long flag; /* mask it */ raw_spin_lock_irqsave(&iommu->register_lock, flag); - writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG); + writel(DMA_FECTL_IM, iommu->reg + reg); /* Read a reg to force flush the post write */ - readl(iommu->reg + DMAR_FECTL_REG); + readl(iommu->reg + reg); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } void dmar_msi_write(int irq, struct msi_msg *msg) { struct intel_iommu *iommu = irq_get_handler_data(irq); + int reg = dmar_msi_reg(iommu, irq); unsigned long flag; raw_spin_lock_irqsave(&iommu->register_lock, flag); - writel(msg->data, iommu->reg + DMAR_FEDATA_REG); - writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG); - writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG); + writel(msg->data, iommu->reg + reg + 4); + writel(msg->address_lo, iommu->reg + reg + 8); + writel(msg->address_hi, iommu->reg + reg + 12); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } void dmar_msi_read(int irq, struct msi_msg *msg) { struct intel_iommu *iommu = irq_get_handler_data(irq); + int reg = dmar_msi_reg(iommu, irq); unsigned long flag; raw_spin_lock_irqsave(&iommu->register_lock, flag); - msg->data = readl(iommu->reg + DMAR_FEDATA_REG); - msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG); - msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG); + msg->data = readl(iommu->reg + reg + 4); + msg->address_lo = readl(iommu->reg + reg + 8); + msg->address_hi = readl(iommu->reg + reg + 12); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index d65cf42399e8..6a10d97f9f6d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -418,10 +418,13 @@ struct device_domain_info { struct list_head global; /* link to global list */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ - struct { - u8 enabled:1; - u8 qdep; - } ats; /* ATS state */ + u8 pasid_supported:3; + u8 pasid_enabled:1; + u8 pri_supported:1; + u8 pri_enabled:1; + u8 ats_supported:1; + u8 ats_enabled:1; + u8 ats_qdep; struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ @@ -497,13 +500,37 @@ static int dmar_forcedac; static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int intel_iommu_ecs = 1; +static int intel_iommu_pasid28; +static int iommu_identity_mapping; -/* We only actually use ECS when PASID support (on the new bit 40) - * is also advertised. Some early implementations — the ones with - * PASID support on bit 28 — have issues even when we *only* use - * extended root/context tables. */ +#define IDENTMAP_ALL 1 +#define IDENTMAP_GFX 2 +#define IDENTMAP_AZALIA 4 + +/* Broadwell and Skylake have broken ECS support — normal so-called "second + * level" translation of DMA requests-without-PASID doesn't actually happen + * unless you also set the NESTE bit in an extended context-entry. Which of + * course means that SVM doesn't work because it's trying to do nested + * translation of the physical addresses it finds in the process page tables, + * through the IOVA->phys mapping found in the "second level" page tables. + * + * The VT-d specification was retroactively changed to change the definition + * of the capability bits and pretend that Broadwell/Skylake never happened... + * but unfortunately the wrong bit was changed. It's ECS which is broken, but + * for some reason it was the PASID capability bit which was redefined (from + * bit 28 on BDW/SKL to bit 40 in future). + * + * So our test for ECS needs to eschew those implementations which set the old + * PASID capabiity bit 28, since those are the ones on which ECS is broken. + * Unless we are working around the 'pasid28' limitations, that is, by putting + * the device into passthrough mode for normal DMA and thus masking the bug. + */ #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \ - ecap_pasid(iommu->ecap)) + (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap))) +/* PASID support is thus enabled if ECS is enabled and *either* of the old + * or new capability bits are set. */ +#define pasid_enabled(iommu) (ecs_enabled(iommu) && \ + (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap))) int intel_iommu_gfx_mapped; EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); @@ -566,6 +593,11 @@ static int __init intel_iommu_setup(char *str) printk(KERN_INFO "Intel-IOMMU: disable extended context table support\n"); intel_iommu_ecs = 0; + } else if (!strncmp(str, "pasid28", 7)) { + printk(KERN_INFO + "Intel-IOMMU: enable pre-production PASID support\n"); + intel_iommu_pasid28 = 1; + iommu_identity_mapping |= IDENTMAP_GFX; } str += strcspn(str, ","); @@ -1407,37 +1439,22 @@ static struct device_domain_info * iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu, u8 bus, u8 devfn) { - bool found = false; struct device_domain_info *info; - struct pci_dev *pdev; assert_spin_locked(&device_domain_lock); - if (!ecap_dev_iotlb_support(iommu->ecap)) - return NULL; - if (!iommu->qi) return NULL; list_for_each_entry(info, &domain->devices, link) if (info->iommu == iommu && info->bus == bus && info->devfn == devfn) { - found = true; + if (info->ats_supported && info->dev) + return info; break; } - if (!found || !info->dev || !dev_is_pci(info->dev)) - return NULL; - - pdev = to_pci_dev(info->dev); - - if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS)) - return NULL; - - if (!dmar_find_matched_atsr_unit(pdev)) - return NULL; - - return info; + return NULL; } static void iommu_enable_dev_iotlb(struct device_domain_info *info) @@ -1448,20 +1465,48 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) return; pdev = to_pci_dev(info->dev); - if (pci_enable_ats(pdev, VTD_PAGE_SHIFT)) - return; - info->ats.enabled = 1; - info->ats.qdep = pci_ats_queue_depth(pdev); +#ifdef CONFIG_INTEL_IOMMU_SVM + /* The PCIe spec, in its wisdom, declares that the behaviour of + the device if you enable PASID support after ATS support is + undefined. So always enable PASID support on devices which + have it, even if we can't yet know if we're ever going to + use it. */ + if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) + info->pasid_enabled = 1; + + if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) + info->pri_enabled = 1; +#endif + if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { + info->ats_enabled = 1; + info->ats_qdep = pci_ats_queue_depth(pdev); + } } static void iommu_disable_dev_iotlb(struct device_domain_info *info) { - if (!info->ats.enabled) + struct pci_dev *pdev; + + if (dev_is_pci(info->dev)) return; - pci_disable_ats(to_pci_dev(info->dev)); - info->ats.enabled = 0; + pdev = to_pci_dev(info->dev); + + if (info->ats_enabled) { + pci_disable_ats(pdev); + info->ats_enabled = 0; + } +#ifdef CONFIG_INTEL_IOMMU_SVM + if (info->pri_enabled) { + pci_disable_pri(pdev); + info->pri_enabled = 0; + } + if (info->pasid_enabled) { + pci_disable_pasid(pdev); + info->pasid_enabled = 0; + } +#endif } static void iommu_flush_dev_iotlb(struct dmar_domain *domain, @@ -1473,11 +1518,11 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain, spin_lock_irqsave(&device_domain_lock, flags); list_for_each_entry(info, &domain->devices, link) { - if (!info->ats.enabled) + if (!info->ats_enabled) continue; sid = info->bus << 8 | info->devfn; - qdep = info->ats.qdep; + qdep = info->ats_qdep; qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); } spin_unlock_irqrestore(&device_domain_lock, flags); @@ -1667,6 +1712,14 @@ static void free_dmar_iommu(struct intel_iommu *iommu) /* free context mapping */ free_context_table(iommu); + +#ifdef CONFIG_INTEL_IOMMU_SVM + if (pasid_enabled(iommu)) { + if (ecap_prs(iommu->ecap)) + intel_svm_finish_prq(iommu); + intel_svm_free_pasid_tables(iommu); + } +#endif } static struct dmar_domain *alloc_domain(int flags) @@ -1934,8 +1987,10 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } info = iommu_support_dev_iotlb(domain, iommu, bus, devfn); - translation = info ? CONTEXT_TT_DEV_IOTLB : - CONTEXT_TT_MULTI_LEVEL; + if (info && info->ats_supported) + translation = CONTEXT_TT_DEV_IOTLB; + else + translation = CONTEXT_TT_MULTI_LEVEL; context_set_address_root(context, virt_to_phys(pgd)); context_set_address_width(context, iommu->agaw); @@ -2273,12 +2328,34 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, info->bus = bus; info->devfn = devfn; - info->ats.enabled = 0; - info->ats.qdep = 0; + info->ats_supported = info->pasid_supported = info->pri_supported = 0; + info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; + info->ats_qdep = 0; info->dev = dev; info->domain = domain; info->iommu = iommu; + if (dev && dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(info->dev); + + if (ecap_dev_iotlb_support(iommu->ecap) && + pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) && + dmar_find_matched_atsr_unit(pdev)) + info->ats_supported = 1; + + if (ecs_enabled(iommu)) { + if (pasid_enabled(iommu)) { + int features = pci_pasid_features(pdev); + if (features >= 0) + info->pasid_supported = features | 1; + } + + if (info->ats_supported && ecap_prs(iommu->ecap) && + pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI)) + info->pri_supported = 1; + } + } + spin_lock_irqsave(&device_domain_lock, flags); if (dev) found = find_domain(dev); @@ -2404,11 +2481,6 @@ found_domain: return domain; } -static int iommu_identity_mapping; -#define IDENTMAP_ALL 1 -#define IDENTMAP_GFX 2 -#define IDENTMAP_AZALIA 4 - static int iommu_domain_identity_map(struct dmar_domain *domain, unsigned long long start, unsigned long long end) @@ -3100,6 +3172,10 @@ static int __init init_dmars(void) if (!ecap_pass_through(iommu->ecap)) hw_pass_through = 0; +#ifdef CONFIG_INTEL_IOMMU_SVM + if (pasid_enabled(iommu)) + intel_svm_alloc_pasid_tables(iommu); +#endif } if (iommu_pass_through) @@ -3187,6 +3263,13 @@ domains_done: iommu_flush_write_buffer(iommu); +#ifdef CONFIG_INTEL_IOMMU_SVM + if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) { + ret = intel_svm_enable_prq(iommu); + if (ret) + goto free_iommu; + } +#endif ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu; @@ -4115,6 +4198,11 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) if (ret) goto out; +#ifdef CONFIG_INTEL_IOMMU_SVM + if (pasid_enabled(iommu)) + intel_svm_alloc_pasid_tables(iommu); +#endif + if (dmaru->ignored) { /* * we always have to disable PMRs or DMA may fail on this device @@ -4126,6 +4214,14 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); + +#ifdef CONFIG_INTEL_IOMMU_SVM + if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) { + ret = intel_svm_enable_prq(iommu); + if (ret) + goto disable_iommu; + } +#endif ret = dmar_set_interrupt(iommu); if (ret) goto disable_iommu; @@ -4194,14 +4290,17 @@ int dmar_find_matched_atsr_unit(struct pci_dev *dev) dev = pci_physfn(dev); for (bus = dev->bus; bus; bus = bus->parent) { bridge = bus->self; - if (!bridge || !pci_is_pcie(bridge) || + /* If it's an integrated device, allow ATS */ + if (!bridge) + return 1; + /* Connected via non-PCIe: no ATS */ + if (!pci_is_pcie(bridge) || pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) return 0; + /* If we found the root port, look it up in the ATSR */ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) break; } - if (!bridge) - return 0; rcu_read_lock(); list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { @@ -4865,6 +4964,114 @@ static void intel_iommu_remove_device(struct device *dev) iommu_device_unlink(iommu->iommu_dev, dev); } +#ifdef CONFIG_INTEL_IOMMU_SVM +int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev) +{ + struct device_domain_info *info; + struct context_entry *context; + struct dmar_domain *domain; + unsigned long flags; + u64 ctx_lo; + int ret; + + domain = get_valid_domain_for_dev(sdev->dev); + if (!domain) + return -EINVAL; + + spin_lock_irqsave(&device_domain_lock, flags); + spin_lock(&iommu->lock); + + ret = -EINVAL; + info = sdev->dev->archdata.iommu; + if (!info || !info->pasid_supported) + goto out; + + context = iommu_context_addr(iommu, info->bus, info->devfn, 0); + if (WARN_ON(!context)) + goto out; + + ctx_lo = context[0].lo; + + sdev->did = domain->iommu_did[iommu->seq_id]; + sdev->sid = PCI_DEVID(info->bus, info->devfn); + + if (!(ctx_lo & CONTEXT_PASIDE)) { + context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table); + context[1].lo = (u64)virt_to_phys(iommu->pasid_table) | ecap_pss(iommu->ecap); + wmb(); + /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both + * extended to permit requests-with-PASID if the PASIDE bit + * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH, + * however, the PASIDE bit is ignored and requests-with-PASID + * are unconditionally blocked. Which makes less sense. + * So convert from CONTEXT_TT_PASS_THROUGH to one of the new + * "guest mode" translation types depending on whether ATS + * is available or not. Annoyingly, we can't use the new + * modes *unless* PASIDE is set. */ + if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) { + ctx_lo &= ~CONTEXT_TT_MASK; + if (info->ats_supported) + ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2; + else + ctx_lo |= CONTEXT_TT_PT_PASID << 2; + } + ctx_lo |= CONTEXT_PASIDE; + if (iommu->pasid_state_table) + ctx_lo |= CONTEXT_DINVE; + if (info->pri_supported) + ctx_lo |= CONTEXT_PRS; + context[0].lo = ctx_lo; + wmb(); + iommu->flush.flush_context(iommu, sdev->did, sdev->sid, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + } + + /* Enable PASID support in the device, if it wasn't already */ + if (!info->pasid_enabled) + iommu_enable_dev_iotlb(info); + + if (info->ats_enabled) { + sdev->dev_iotlb = 1; + sdev->qdep = info->ats_qdep; + if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS) + sdev->qdep = 0; + } + ret = 0; + + out: + spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} + +struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) +{ + struct intel_iommu *iommu; + u8 bus, devfn; + + if (iommu_dummy(dev)) { + dev_warn(dev, + "No IOMMU translation for device; cannot enable SVM\n"); + return NULL; + } + + iommu = device_to_iommu(dev, &bus, &devfn); + if ((!iommu)) { + dev_err(dev, "No IOMMU for device; cannot enable SVM\n"); + return NULL; + } + + if (!iommu->pasid_table) { + dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n"); + return NULL; + } + + return iommu; +} +#endif /* CONFIG_INTEL_IOMMU_SVM */ + static const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c new file mode 100644 index 000000000000..c69e3f9ec958 --- /dev/null +++ b/drivers/iommu/intel-svm.c @@ -0,0 +1,602 @@ +/* + * Copyright © 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Authors: David Woodhouse + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static irqreturn_t prq_event_thread(int irq, void *d); + +struct pasid_entry { + u64 val; +}; + +struct pasid_state_entry { + u64 val; +}; + +int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu) +{ + struct page *pages; + int order; + + order = ecap_pss(iommu->ecap) + 7 - PAGE_SHIFT; + if (order < 0) + order = 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!pages) { + pr_warn("IOMMU: %s: Failed to allocate PASID table\n", + iommu->name); + return -ENOMEM; + } + iommu->pasid_table = page_address(pages); + pr_info("%s: Allocated order %d PASID table.\n", iommu->name, order); + + if (ecap_dis(iommu->ecap)) { + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + if (pages) + iommu->pasid_state_table = page_address(pages); + else + pr_warn("IOMMU: %s: Failed to allocate PASID state table\n", + iommu->name); + } + + idr_init(&iommu->pasid_idr); + + return 0; +} + +int intel_svm_free_pasid_tables(struct intel_iommu *iommu) +{ + int order; + + order = ecap_pss(iommu->ecap) + 7 - PAGE_SHIFT; + if (order < 0) + order = 0; + + if (iommu->pasid_table) { + free_pages((unsigned long)iommu->pasid_table, order); + iommu->pasid_table = NULL; + } + if (iommu->pasid_state_table) { + free_pages((unsigned long)iommu->pasid_state_table, order); + iommu->pasid_state_table = NULL; + } + idr_destroy(&iommu->pasid_idr); + return 0; +} + +#define PRQ_ORDER 0 + +int intel_svm_enable_prq(struct intel_iommu *iommu) +{ + struct page *pages; + int irq, ret; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); + if (!pages) { + pr_warn("IOMMU: %s: Failed to allocate page request queue\n", + iommu->name); + return -ENOMEM; + } + iommu->prq = page_address(pages); + + irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu); + if (irq <= 0) { + pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", + iommu->name); + ret = -EINVAL; + err: + free_pages((unsigned long)iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + return ret; + } + iommu->pr_irq = irq; + + snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); + + ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, + iommu->prq_name, iommu); + if (ret) { + pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", + iommu->name); + dmar_free_hwirq(irq); + goto err; + } + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + + return 0; +} + +int intel_svm_finish_prq(struct intel_iommu *iommu) +{ + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); + + free_irq(iommu->pr_irq, iommu); + dmar_free_hwirq(iommu->pr_irq); + iommu->pr_irq = 0; + + free_pages((unsigned long)iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + + return 0; +} + +static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev, + unsigned long address, unsigned long pages, int ih, int gl) +{ + struct qi_desc desc; + + if (pages == -1) { + /* For global kernel pages we have to flush them in *all* PASIDs + * because that's the only option the hardware gives us. Despite + * the fact that they are actually only accessible through one. */ + if (gl) + desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) | + QI_EIOTLB_GRAN(QI_GRAN_ALL_ALL) | QI_EIOTLB_TYPE; + else + desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE; + desc.high = 0; + } else { + int mask = ilog2(__roundup_pow_of_two(pages)); + + desc.low = QI_EIOTLB_PASID(svm->pasid) | QI_EIOTLB_DID(sdev->did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | QI_EIOTLB_TYPE; + desc.high = QI_EIOTLB_ADDR(address) | QI_EIOTLB_GL(gl) | + QI_EIOTLB_IH(ih) | QI_EIOTLB_AM(mask); + } + qi_submit_sync(&desc, svm->iommu); + + if (sdev->dev_iotlb) { + desc.low = QI_DEV_EIOTLB_PASID(svm->pasid) | QI_DEV_EIOTLB_SID(sdev->sid) | + QI_DEV_EIOTLB_QDEP(sdev->qdep) | QI_DEIOTLB_TYPE; + if (pages == -1) { + desc.high = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) | QI_DEV_EIOTLB_SIZE; + } else if (pages > 1) { + /* The least significant zero bit indicates the size. So, + * for example, an "address" value of 0x12345f000 will + * flush from 0x123440000 to 0x12347ffff (256KiB). */ + unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT); + unsigned long mask = __rounddown_pow_of_two(address ^ last);; + + desc.high = QI_DEV_EIOTLB_ADDR((address & ~mask) | (mask - 1)) | QI_DEV_EIOTLB_SIZE; + } else { + desc.high = QI_DEV_EIOTLB_ADDR(address); + } + qi_submit_sync(&desc, svm->iommu); + } +} + +static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address, + unsigned long pages, int ih, int gl) +{ + struct intel_svm_dev *sdev; + + /* Try deferred invalidate if available */ + if (svm->iommu->pasid_state_table && + !cmpxchg64(&svm->iommu->pasid_state_table[svm->pasid].val, 0, 1ULL << 63)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(sdev, &svm->devs, list) + intel_flush_svm_range_dev(svm, sdev, address, pages, ih, gl); + rcu_read_unlock(); +} + +static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, + unsigned long address, pte_t pte) +{ + struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + + intel_flush_svm_range(svm, address, 1, 1, 0); +} + +static void intel_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm, + unsigned long address) +{ + struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + + intel_flush_svm_range(svm, address, 1, 1, 0); +} + +/* Pages have been freed at this point */ +static void intel_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + + intel_flush_svm_range(svm, start, + (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0, 0); +} + + +static void intel_flush_pasid_dev(struct intel_svm *svm, struct intel_svm_dev *sdev, int pasid) +{ + struct qi_desc desc; + + desc.high = 0; + desc.low = QI_PC_TYPE | QI_PC_DID(sdev->did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid); + + qi_submit_sync(&desc, svm->iommu); +} + +static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); + + svm->iommu->pasid_table[svm->pasid].val = 0; + + /* There's no need to do any flush because we can't get here if there + * are any devices left anyway. */ + WARN_ON(!list_empty(&svm->devs)); +} + +static const struct mmu_notifier_ops intel_mmuops = { + .release = intel_mm_release, + .change_pte = intel_change_pte, + .invalidate_page = intel_invalidate_page, + .invalidate_range = intel_invalidate_range, +}; + +static DEFINE_MUTEX(pasid_mutex); + +int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_svm_dev *sdev; + struct intel_svm *svm = NULL; + struct mm_struct *mm = NULL; + int pasid_max; + int ret; + + if (WARN_ON(!iommu)) + return -EINVAL; + + if (dev_is_pci(dev)) { + pasid_max = pci_max_pasids(to_pci_dev(dev)); + if (pasid_max < 0) + return -EINVAL; + } else + pasid_max = 1 << 20; + + if ((flags & SVM_FLAG_SUPERVISOR_MODE)) { + if (!ecap_srs(iommu->ecap)) + return -EINVAL; + } else if (pasid) { + mm = get_task_mm(current); + BUG_ON(!mm); + } + + mutex_lock(&pasid_mutex); + if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) { + int i; + + idr_for_each_entry(&iommu->pasid_idr, svm, i) { + if (svm->mm != mm || + (svm->flags & SVM_FLAG_PRIVATE_PASID)) + continue; + + if (svm->pasid >= pasid_max) { + dev_warn(dev, + "Limited PASID width. Cannot use existing PASID %d\n", + svm->pasid); + ret = -ENOSPC; + goto out; + } + + list_for_each_entry(sdev, &svm->devs, list) { + if (dev == sdev->dev) { + if (sdev->ops != ops) { + ret = -EBUSY; + goto out; + } + sdev->users++; + goto success; + } + } + + break; + } + } + + sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!sdev) { + ret = -ENOMEM; + goto out; + } + sdev->dev = dev; + + ret = intel_iommu_enable_pasid(iommu, sdev); + if (ret || !pasid) { + /* If they don't actually want to assign a PASID, this is + * just an enabling check/preparation. */ + kfree(sdev); + goto out; + } + /* Finish the setup now we know we're keeping it */ + sdev->users = 1; + sdev->ops = ops; + init_rcu_head(&sdev->rcu); + + if (!svm) { + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) { + ret = -ENOMEM; + kfree(sdev); + goto out; + } + svm->iommu = iommu; + + if (pasid_max > 2 << ecap_pss(iommu->ecap)) + pasid_max = 2 << ecap_pss(iommu->ecap); + + /* Do not use PASID 0 in caching mode (virtualised IOMMU) */ + ret = idr_alloc(&iommu->pasid_idr, svm, + !!cap_caching_mode(iommu->cap), + pasid_max - 1, GFP_KERNEL); + if (ret < 0) { + kfree(svm); + goto out; + } + svm->pasid = ret; + svm->notifier.ops = &intel_mmuops; + svm->mm = mm; + svm->flags = flags; + INIT_LIST_HEAD_RCU(&svm->devs); + ret = -ENOMEM; + if (mm) { + ret = mmu_notifier_register(&svm->notifier, mm); + if (ret) { + idr_remove(&svm->iommu->pasid_idr, svm->pasid); + kfree(svm); + kfree(sdev); + goto out; + } + iommu->pasid_table[svm->pasid].val = (u64)__pa(mm->pgd) | 1; + mm = NULL; + } else + iommu->pasid_table[svm->pasid].val = (u64)__pa(init_mm.pgd) | 1 | (1ULL << 11); + wmb(); + /* In caching mode, we still have to flush with PASID 0 when + * a PASID table entry becomes present. Not entirely clear + * *why* that would be the case — surely we could just issue + * a flush with the PASID value that we've changed? The PASID + * is the index into the table, after all. It's not like domain + * IDs in the case of the equivalent context-entry change in + * caching mode. And for that matter it's not entirely clear why + * a VMM would be in the business of caching the PASID table + * anyway. Surely that can be left entirely to the guest? */ + if (cap_caching_mode(iommu->cap)) + intel_flush_pasid_dev(svm, sdev, 0); + } + list_add_rcu(&sdev->list, &svm->devs); + + success: + *pasid = svm->pasid; + ret = 0; + out: + mutex_unlock(&pasid_mutex); + if (mm) + mmput(mm); + return ret; +} +EXPORT_SYMBOL_GPL(intel_svm_bind_mm); + +int intel_svm_unbind_mm(struct device *dev, int pasid) +{ + struct intel_svm_dev *sdev; + struct intel_iommu *iommu; + struct intel_svm *svm; + int ret = -EINVAL; + + mutex_lock(&pasid_mutex); + iommu = intel_svm_device_to_iommu(dev); + if (!iommu || !iommu->pasid_table) + goto out; + + svm = idr_find(&iommu->pasid_idr, pasid); + if (!svm) + goto out; + + list_for_each_entry(sdev, &svm->devs, list) { + if (dev == sdev->dev) { + ret = 0; + sdev->users--; + if (!sdev->users) { + list_del_rcu(&sdev->list); + /* Flush the PASID cache and IOTLB for this device. + * Note that we do depend on the hardware *not* using + * the PASID any more. Just as we depend on other + * devices never using PASIDs that they have no right + * to use. We have a *shared* PASID table, because it's + * large and has to be physically contiguous. So it's + * hard to be as defensive as we might like. */ + intel_flush_pasid_dev(svm, sdev, svm->pasid); + intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm); + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + mmu_notifier_unregister(&svm->notifier, svm->mm); + + idr_remove(&svm->iommu->pasid_idr, svm->pasid); + if (svm->mm) + mmput(svm->mm); + /* We mandate that no page faults may be outstanding + * for the PASID when intel_svm_unbind_mm() is called. + * If that is not obeyed, subtle errors will happen. + * Let's make them less subtle... */ + memset(svm, 0x6b, sizeof(*svm)); + kfree(svm); + } + } + break; + } + } + out: + mutex_unlock(&pasid_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(intel_svm_unbind_mm); + +/* Page request queue descriptor */ +struct page_req_dsc { + u64 srr:1; + u64 bof:1; + u64 pasid_present:1; + u64 lpig:1; + u64 pasid:20; + u64 bus:8; + u64 private:23; + u64 prg_index:9; + u64 rd_req:1; + u64 wr_req:1; + u64 exe_req:1; + u64 priv_req:1; + u64 devfn:8; + u64 addr:52; +}; + +#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10) +static irqreturn_t prq_event_thread(int irq, void *d) +{ + struct intel_iommu *iommu = d; + struct intel_svm *svm = NULL; + int head, tail, handled = 0; + + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct intel_svm_dev *sdev; + struct vm_area_struct *vma; + struct page_req_dsc *req; + struct qi_desc resp; + int ret, result; + u64 address; + + handled = 1; + + req = &iommu->prq[head / sizeof(*req)]; + + result = QI_RESP_FAILURE; + address = (u64)req->addr << VTD_PAGE_SHIFT; + if (!req->pasid_present) { + pr_err("%s: Page request without PASID: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + goto bad_req; + } + + if (!svm || svm->pasid != req->pasid) { + rcu_read_lock(); + svm = idr_find(&iommu->pasid_idr, req->pasid); + /* It *can't* go away, because the driver is not permitted + * to unbind the mm while any page faults are outstanding. + * So we only need RCU to protect the internal idr code. */ + rcu_read_unlock(); + + if (!svm) { + pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n", + iommu->name, req->pasid, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + goto no_pasid; + } + } + + result = QI_RESP_INVALID; + /* Since we're using init_mm.pgd directly, we should never take + * any faults on kernel addresses. */ + if (!svm->mm) + goto bad_req; + down_read(&svm->mm->mmap_sem); + vma = find_extend_vma(svm->mm, address); + if (!vma || address < vma->vm_start) + goto invalid; + + ret = handle_mm_fault(svm->mm, vma, address, + req->wr_req ? FAULT_FLAG_WRITE : 0); + if (ret & VM_FAULT_ERROR) + goto invalid; + + result = QI_RESP_SUCCESS; + invalid: + up_read(&svm->mm->mmap_sem); + bad_req: + /* Accounting for major/minor faults? */ + rcu_read_lock(); + list_for_each_entry_rcu(sdev, &svm->devs, list) { + if (sdev->sid == PCI_DEVID(req->bus, req->devfn)) + break; + } + /* Other devices can go away, but the drivers are not permitted + * to unbind while any page faults might be in flight. So it's + * OK to drop the 'lock' here now we have it. */ + rcu_read_unlock(); + + if (WARN_ON(&sdev->list == &svm->devs)) + sdev = NULL; + + if (sdev && sdev->ops && sdev->ops->fault_cb) { + int rwxp = (req->rd_req << 3) | (req->wr_req << 2) | + (req->exe_req << 1) | (req->priv_req); + sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr, req->private, rwxp, result); + } + /* We get here in the error case where the PASID lookup failed, + and these can be NULL. Do not use them below this point! */ + sdev = NULL; + svm = NULL; + no_pasid: + if (req->lpig) { + /* Page Group Response */ + resp.low = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID((req->bus << 8) | req->devfn) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_RESP_TYPE; + resp.high = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_PRIV(req->private) | QI_PGRP_RESP_CODE(result); + + qi_submit_sync(&resp, iommu); + } else if (req->srr) { + /* Page Stream Response */ + resp.low = QI_PSTRM_IDX(req->prg_index) | + QI_PSTRM_PRIV(req->private) | QI_PSTRM_BUS(req->bus) | + QI_PSTRM_PASID(req->pasid) | QI_PSTRM_RESP_TYPE; + resp.high = QI_PSTRM_ADDR(address) | QI_PSTRM_DEVFN(req->devfn) | + QI_PSTRM_RESP_CODE(result); + + qi_submit_sync(&resp, iommu); + } + + head = (head + sizeof(*req)) & PRQ_RING_MASK; + } + + dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + + return IRQ_RETVAL(handled); +} diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 7ac17f57250e..187c10299722 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -20,6 +20,14 @@ #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THROUGH 2 +/* Extended context entry types */ +#define CONTEXT_TT_PT_PASID 4 +#define CONTEXT_TT_PT_PASID_DEV_IOTLB 5 +#define CONTEXT_TT_MASK (7ULL << 2) + +#define CONTEXT_DINVE (1ULL << 8) +#define CONTEXT_PRS (1ULL << 9) +#define CONTEXT_PASIDE (1ULL << 11) struct intel_iommu; struct dmar_domain; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 6240063bdcac..821273ca4873 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -1,5 +1,9 @@ /* - * Copyright (c) 2006, Intel Corporation. + * Copyright © 2006-2015, Intel Corporation. + * + * Authors: Ashok Raj + * Anil S Keshavamurthy + * David Woodhouse * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -13,10 +17,6 @@ * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Author: Ashok Raj - * Author: Anil S Keshavamurthy */ #ifndef _INTEL_IOMMU_H_ @@ -25,7 +25,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -57,16 +60,21 @@ #define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ #define DMAR_ICS_REG 0x9c /* Invalidation complete status register */ #define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ +#define DMAR_PQH_REG 0xc0 /* Page request queue head register */ +#define DMAR_PQT_REG 0xc8 /* Page request queue tail register */ +#define DMAR_PQA_REG 0xd0 /* Page request queue address register */ +#define DMAR_PRS_REG 0xdc /* Page request status register */ +#define DMAR_PECTL_REG 0xe0 /* Page request event control register */ +#define DMAR_PEDATA_REG 0xe4 /* Page request event interrupt data register */ +#define DMAR_PEADDR_REG 0xe8 /* Page request event interrupt addr register */ +#define DMAR_PEUADDR_REG 0xec /* Page request event Upper address register */ #define OFFSET_STRIDE (9) -/* -#define dmar_readl(dmar, reg) readl(dmar + reg) -#define dmar_readq(dmar, reg) ({ \ - u32 lo, hi; \ - lo = readl(dmar + reg); \ - hi = readl(dmar + reg + 4); \ - (((u64) hi) << 32) + lo; }) -*/ + +#ifdef CONFIG_64BIT +#define dmar_readq(a) readq(a) +#define dmar_writeq(a,v) writeq(v,a) +#else static inline u64 dmar_readq(void __iomem *addr) { u32 lo, hi; @@ -80,6 +88,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) writel((u32)val, addr); writel((u32)(val >> 32), addr + 4); } +#endif #define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) #define DMAR_VER_MINOR(v) ((v) & 0x0f) @@ -123,7 +132,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define ecap_srs(e) ((e >> 31) & 0x1) #define ecap_ers(e) ((e >> 30) & 0x1) #define ecap_prs(e) ((e >> 29) & 0x1) -/* PASID support used to be on bit 28 */ +#define ecap_broken_pasid(e) ((e >> 28) & 0x1) #define ecap_dis(e) ((e >> 27) & 0x1) #define ecap_nest(e) ((e >> 26) & 0x1) #define ecap_mts(e) ((e >> 25) & 0x1) @@ -253,6 +262,11 @@ enum { #define QI_DIOTLB_TYPE 0x3 #define QI_IEC_TYPE 0x4 #define QI_IWD_TYPE 0x5 +#define QI_EIOTLB_TYPE 0x6 +#define QI_PC_TYPE 0x7 +#define QI_DEIOTLB_TYPE 0x8 +#define QI_PGRP_RESP_TYPE 0x9 +#define QI_PSTRM_RESP_TYPE 0xa #define QI_IEC_SELECTIVE (((u64)1) << 4) #define QI_IEC_IIDEX(idx) (((u64)(idx & 0xffff) << 32)) @@ -280,6 +294,53 @@ enum { #define QI_DEV_IOTLB_SIZE 1 #define QI_DEV_IOTLB_MAX_INVS 32 +#define QI_PC_PASID(pasid) (((u64)pasid) << 32) +#define QI_PC_DID(did) (((u64)did) << 16) +#define QI_PC_GRAN(gran) (((u64)gran) << 4) + +#define QI_PC_ALL_PASIDS (QI_PC_TYPE | QI_PC_GRAN(0)) +#define QI_PC_PASID_SEL (QI_PC_TYPE | QI_PC_GRAN(1)) + +#define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) +#define QI_EIOTLB_GL(gl) (((u64)gl) << 7) +#define QI_EIOTLB_IH(ih) (((u64)ih) << 6) +#define QI_EIOTLB_AM(am) (((u64)am)) +#define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) +#define QI_EIOTLB_DID(did) (((u64)did) << 16) +#define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) + +#define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) +#define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) +#define QI_DEV_EIOTLB_GLOB(g) ((u64)g) +#define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32) +#define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) +#define QI_DEV_EIOTLB_QDEP(qd) (((qd) & 0x1f) << 16) +#define QI_DEV_EIOTLB_MAX_INVS 32 + +#define QI_PGRP_IDX(idx) (((u64)(idx)) << 55) +#define QI_PGRP_PRIV(priv) (((u64)(priv)) << 32) +#define QI_PGRP_RESP_CODE(res) ((u64)(res)) +#define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) +#define QI_PGRP_DID(did) (((u64)(did)) << 16) +#define QI_PGRP_PASID_P(p) (((u64)(p)) << 4) + +#define QI_PSTRM_ADDR(addr) (((u64)(addr)) & VTD_PAGE_MASK) +#define QI_PSTRM_DEVFN(devfn) (((u64)(devfn)) << 4) +#define QI_PSTRM_RESP_CODE(res) ((u64)(res)) +#define QI_PSTRM_IDX(idx) (((u64)(idx)) << 55) +#define QI_PSTRM_PRIV(priv) (((u64)(priv)) << 32) +#define QI_PSTRM_BUS(bus) (((u64)(bus)) << 24) +#define QI_PSTRM_PASID(pasid) (((u64)(pasid)) << 4) + +#define QI_RESP_SUCCESS 0x0 +#define QI_RESP_INVALID 0x1 +#define QI_RESP_FAILURE 0xf + +#define QI_GRAN_ALL_ALL 0 +#define QI_GRAN_NONG_ALL 1 +#define QI_GRAN_NONG_PASID 2 +#define QI_GRAN_PSI_PASID 3 + struct qi_desc { u64 low, high; }; @@ -327,6 +388,10 @@ enum { #define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0) #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) +struct pasid_entry; +struct pasid_state_entry; +struct page_req_dsc; + struct intel_iommu { void __iomem *reg; /* Pointer to hardware regs, virtual addr */ u64 reg_phys; /* physical address of hw register set */ @@ -338,7 +403,7 @@ struct intel_iommu { int seq_id; /* sequence id of the iommu */ int agaw; /* agaw of this iommu */ int msagaw; /* max sagaw of this iommu */ - unsigned int irq; + unsigned int irq, pr_irq; u16 segment; /* PCI segment# */ unsigned char name[13]; /* Device Name */ @@ -349,6 +414,18 @@ struct intel_iommu { struct root_entry *root_entry; /* virtual address */ struct iommu_flush flush; +#endif +#ifdef CONFIG_INTEL_IOMMU_SVM + /* These are large and need to be contiguous, so we allocate just + * one for now. We'll maybe want to rethink that if we truly give + * devices away to userspace processes (e.g. for DPDK) and don't + * want to trust that userspace will use *only* the PASID it was + * told to. But while it's all driver-arbitrated, we're fine. */ + struct pasid_entry *pasid_table; + struct pasid_state_entry *pasid_state_table; + struct page_req_dsc *prq; + unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct idr pasid_idr; #endif struct q_inval *qi; /* Queued invalidation info */ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ @@ -389,6 +466,38 @@ extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); +#ifdef CONFIG_INTEL_IOMMU_SVM +extern int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu); +extern int intel_svm_free_pasid_tables(struct intel_iommu *iommu); +extern int intel_svm_enable_prq(struct intel_iommu *iommu); +extern int intel_svm_finish_prq(struct intel_iommu *iommu); + +struct svm_dev_ops; + +struct intel_svm_dev { + struct list_head list; + struct rcu_head rcu; + struct device *dev; + struct svm_dev_ops *ops; + int users; + u16 did; + u16 dev_iotlb:1; + u16 sid, qdep; +}; + +struct intel_svm { + struct mmu_notifier notifier; + struct mm_struct *mm; + struct intel_iommu *iommu; + int flags; + int pasid; + struct list_head devs; +}; + +extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev); +extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev); +#endif + extern const struct attribute_group *intel_iommu_groups[]; #endif diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h new file mode 100644 index 000000000000..3c25794042f9 --- /dev/null +++ b/include/linux/intel-svm.h @@ -0,0 +1,121 @@ +/* + * Copyright © 2015 Intel Corporation. + * + * Authors: David Woodhouse + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef __INTEL_SVM_H__ +#define __INTEL_SVM_H__ + +struct device; + +struct svm_dev_ops { + void (*fault_cb)(struct device *dev, int pasid, u64 address, + u32 private, int rwxp, int response); +}; + +/* Values for rxwp in fault_cb callback */ +#define SVM_REQ_READ (1<<3) +#define SVM_REQ_WRITE (1<<2) +#define SVM_REQ_EXEC (1<<1) +#define SVM_REQ_PRIV (1<<0) + + +/* + * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main" + * PASID for the current process. Even if a PASID already exists, a new one + * will be allocated. And the PASID allocated with SVM_FLAG_PRIVATE_PASID + * will not be given to subsequent callers. This facility allows a driver to + * disambiguate between multiple device contexts which access the same MM, + * if there is no other way to do so. It should be used sparingly, if at all. + */ +#define SVM_FLAG_PRIVATE_PASID (1<<0) + +/* + * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only + * for access to kernel addresses. No IOTLB flushes are automatically done + * for kernel mappings; it is valid only for access to the kernel's static + * 1:1 mapping of physical memory — not to vmalloc or even module mappings. + * A future API addition may permit the use of such ranges, by means of an + * explicit IOTLB flush call (akin to the DMA API's unmap method). + * + * It is unlikely that we will ever hook into flush_tlb_kernel_range() to + * do such IOTLB flushes automatically. + */ +#define SVM_FLAG_SUPERVISOR_MODE (1<<1) + +#ifdef CONFIG_INTEL_IOMMU_SVM + +/** + * intel_svm_bind_mm() - Bind the current process to a PASID + * @dev: Device to be granted acccess + * @pasid: Address for allocated PASID + * @flags: Flags. Later for requesting supervisor mode, etc. + * @ops: Callbacks to device driver + * + * This function attempts to enable PASID support for the given device. + * If the @pasid argument is non-%NULL, a PASID is allocated for access + * to the MM of the current process. + * + * By using a %NULL value for the @pasid argument, this function can + * be used to simply validate that PASID support is available for the + * given device — i.e. that it is behind an IOMMU which has the + * requisite support, and is enabled. + * + * Page faults are handled transparently by the IOMMU code, and there + * should be no need for the device driver to be involved. If a page + * fault cannot be handled (i.e. is an invalid address rather than + * just needs paging in), then the page request will be completed by + * the core IOMMU code with appropriate status, and the device itself + * can then report the resulting fault to its driver via whatever + * mechanism is appropriate. + * + * Multiple calls from the same process may result in the same PASID + * being re-used. A reference count is kept. + */ +extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, + struct svm_dev_ops *ops); + +/** + * intel_svm_unbind_mm() - Unbind a specified PASID + * @dev: Device for which PASID was allocated + * @pasid: PASID value to be unbound + * + * This function allows a PASID to be retired when the device no + * longer requires access to the address space of a given process. + * + * If the use count for the PASID in question reaches zero, the + * PASID is revoked and may no longer be used by hardware. + * + * Device drivers are required to ensure that no access (including + * page requests) is currently outstanding for the PASID in question, + * before calling this function. + */ +extern int intel_svm_unbind_mm(struct device *dev, int pasid); + +#else /* CONFIG_INTEL_IOMMU_SVM */ + +static inline int intel_svm_bind_mm(struct device *dev, int *pasid, + int flags, struct svm_dev_ops *ops) +{ + return -ENOSYS; +} + +static inline int intel_svm_unbind_mm(struct device *dev, int pasid) +{ + BUG(); +} +#endif /* CONFIG_INTEL_IOMMU_SVM */ + +#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL)) + +#endif /* __INTEL_SVM_H__ */