diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 92ac995dd9c6..6e8145c36e93 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -222,7 +222,7 @@ void panfrost_mmu_unmap(struct panfrost_gem_object *bo) size_t unmapped_page; size_t pgsize = get_pgsize(iova, len - unmapped_len); - unmapped_page = ops->unmap(ops, iova, pgsize); + unmapped_page = ops->unmap(ops, iova, pgsize, NULL); if (!unmapped_page) break; @@ -247,20 +247,28 @@ static void mmu_tlb_inv_context_s1(void *cookie) mmu_hw_do_operation(pfdev, 0, 0, ~0UL, AS_COMMAND_FLUSH_MEM); } -static void mmu_tlb_inv_range_nosync(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) -{} - static void mmu_tlb_sync_context(void *cookie) { //struct panfrost_device *pfdev = cookie; // TODO: Wait 1000 GPU cycles for HW_ISSUE_6367/T60X } -static const struct iommu_gather_ops mmu_tlb_ops = { +static void mmu_tlb_flush_walk(unsigned long iova, size_t size, size_t granule, + void *cookie) +{ + mmu_tlb_sync_context(cookie); +} + +static void mmu_tlb_flush_leaf(unsigned long iova, size_t size, size_t granule, + void *cookie) +{ + mmu_tlb_sync_context(cookie); +} + +static const struct iommu_flush_ops mmu_tlb_ops = { .tlb_flush_all = mmu_tlb_inv_context_s1, - .tlb_add_flush = mmu_tlb_inv_range_nosync, - .tlb_sync = mmu_tlb_sync_context, + .tlb_flush_walk = mmu_tlb_flush_walk, + .tlb_flush_leaf = mmu_tlb_flush_leaf, }; static const char *access_type_name(struct panfrost_device *pfdev, diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index b607a92791d3..29eeea914660 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3055,7 +3055,8 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, } static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, - size_t page_size) + size_t page_size, + struct iommu_iotlb_gather *gather) { struct protection_domain *domain = to_pdomain(dom); size_t unmap_size; @@ -3196,9 +3197,10 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) domain_flush_complete(dom); } -static void amd_iommu_iotlb_range_add(struct iommu_domain *domain, - unsigned long iova, size_t size) +static void amd_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { + amd_iommu_flush_iotlb_all(domain); } const struct iommu_ops amd_iommu_ops = { @@ -3219,8 +3221,7 @@ const struct iommu_ops amd_iommu_ops = { .is_attach_deferred = amd_iommu_is_attach_deferred, .pgsize_bitmap = AMD_IOMMU_PGSIZES, .flush_iotlb_all = amd_iommu_flush_iotlb_all, - .iotlb_range_add = amd_iommu_iotlb_range_add, - .iotlb_sync = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, }; /***************************************************************************** diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index a9a9fabd3968..b8049ea2e455 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -181,12 +181,13 @@ #define ARM_SMMU_MEMATTR_DEVICE_nGnRE 0x1 #define ARM_SMMU_MEMATTR_OIWB 0xf -#define Q_IDX(q, p) ((p) & ((1 << (q)->max_n_shift) - 1)) -#define Q_WRP(q, p) ((p) & (1 << (q)->max_n_shift)) -#define Q_OVERFLOW_FLAG (1 << 31) -#define Q_OVF(q, p) ((p) & Q_OVERFLOW_FLAG) +#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1)) +#define Q_WRP(llq, p) ((p) & (1 << (llq)->max_n_shift)) +#define Q_OVERFLOW_FLAG (1U << 31) +#define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG) #define Q_ENT(q, 
p) ((q)->base + \ - Q_IDX(q, p) * (q)->ent_dwords) + Q_IDX(&((q)->llq), p) * \ + (q)->ent_dwords) #define Q_BASE_RWA (1UL << 62) #define Q_BASE_ADDR_MASK GENMASK_ULL(51, 5) @@ -306,6 +307,15 @@ #define CMDQ_ERR_CERROR_ABT_IDX 2 #define CMDQ_ERR_CERROR_ATC_INV_IDX 3 +#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG + +/* + * This is used to size the command queue and therefore must be at least + * BITS_PER_LONG so that the valid_map works correctly (it relies on the + * total number of queue entries being a multiple of BITS_PER_LONG). + */ +#define CMDQ_BATCH_ENTRIES BITS_PER_LONG + #define CMDQ_0_OP GENMASK_ULL(7, 0) #define CMDQ_0_SSV (1UL << 11) @@ -368,9 +378,8 @@ #define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12) /* High-level queue structures */ -#define ARM_SMMU_POLL_TIMEOUT_US 100 -#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! */ -#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10 +#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */ +#define ARM_SMMU_POLL_SPIN_COUNT 10 #define MSI_IOVA_BASE 0x8000000 #define MSI_IOVA_LENGTH 0x100000 @@ -472,13 +481,29 @@ struct arm_smmu_cmdq_ent { #define CMDQ_OP_CMD_SYNC 0x46 struct { - u32 msidata; u64 msiaddr; } sync; }; }; +struct arm_smmu_ll_queue { + union { + u64 val; + struct { + u32 prod; + u32 cons; + }; + struct { + atomic_t prod; + atomic_t cons; + } atomic; + u8 __pad[SMP_CACHE_BYTES]; + } ____cacheline_aligned_in_smp; + u32 max_n_shift; +}; + struct arm_smmu_queue { + struct arm_smmu_ll_queue llq; int irq; /* Wired interrupt */ __le64 *base; @@ -486,17 +511,23 @@ struct arm_smmu_queue { u64 q_base; size_t ent_dwords; - u32 max_n_shift; - u32 prod; - u32 cons; u32 __iomem *prod_reg; u32 __iomem *cons_reg; }; +struct arm_smmu_queue_poll { + ktime_t timeout; + unsigned int delay; + unsigned int spin_cnt; + bool wfe; +}; + struct arm_smmu_cmdq { struct arm_smmu_queue q; - spinlock_t lock; + atomic_long_t *valid_map; + atomic_t owner_prod; + atomic_t lock; }; struct arm_smmu_evtq { @@ -576,8 +607,6 @@ struct arm_smmu_device { int gerr_irq; int combined_irq; - u32 sync_nr; - u8 prev_cmd_opcode; unsigned long ias; /* IPA */ unsigned long oas; /* PA */ @@ -596,12 +625,6 @@ struct arm_smmu_device { struct arm_smmu_strtab_cfg strtab_cfg; - /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */ - union { - u32 sync_count; - u64 padding; - }; - /* IOMMU core code handle */ struct iommu_device iommu; }; @@ -614,7 +637,7 @@ struct arm_smmu_master { struct list_head domain_head; u32 *sids; unsigned int num_sids; - bool ats_enabled :1; + bool ats_enabled; }; /* SMMU private data for an IOMMU domain */ @@ -631,6 +654,7 @@ struct arm_smmu_domain { struct io_pgtable_ops *pgtbl_ops; bool non_strict; + atomic_t nr_ats_masters; enum arm_smmu_domain_stage stage; union { @@ -685,85 +709,97 @@ static void parse_driver_options(struct arm_smmu_device *smmu) } /* Low-level queue manipulation functions */ -static bool queue_full(struct arm_smmu_queue *q) +static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n) +{ + u32 space, prod, cons; + + prod = Q_IDX(q, q->prod); + cons = Q_IDX(q, q->cons); + + if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons)) + space = (1 << q->max_n_shift) - (prod - cons); + else + space = cons - prod; + + return space >= n; +} + +static bool queue_full(struct arm_smmu_ll_queue *q) { return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && Q_WRP(q, q->prod) != Q_WRP(q, q->cons); } -static bool queue_empty(struct arm_smmu_queue *q) +static bool queue_empty(struct arm_smmu_ll_queue *q) { return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && Q_WRP(q, 
q->prod) == Q_WRP(q, q->cons); } -static void queue_sync_cons(struct arm_smmu_queue *q) +static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod) { - q->cons = readl_relaxed(q->cons_reg); + return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) > Q_IDX(q, prod))) || + ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) && + (Q_IDX(q, q->cons) <= Q_IDX(q, prod))); } -static void queue_inc_cons(struct arm_smmu_queue *q) +static void queue_sync_cons_out(struct arm_smmu_queue *q) { - u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1; - - q->cons = Q_OVF(q, q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons); - /* * Ensure that all CPU accesses (reads and writes) to the queue * are complete before we update the cons pointer. */ mb(); - writel_relaxed(q->cons, q->cons_reg); + writel_relaxed(q->llq.cons, q->cons_reg); } -static int queue_sync_prod(struct arm_smmu_queue *q) +static void queue_inc_cons(struct arm_smmu_ll_queue *q) +{ + u32 cons = (Q_WRP(q, q->cons) | Q_IDX(q, q->cons)) + 1; + q->cons = Q_OVF(q->cons) | Q_WRP(q, cons) | Q_IDX(q, cons); +} + +static int queue_sync_prod_in(struct arm_smmu_queue *q) { int ret = 0; u32 prod = readl_relaxed(q->prod_reg); - if (Q_OVF(q, prod) != Q_OVF(q, q->prod)) + if (Q_OVF(prod) != Q_OVF(q->llq.prod)) ret = -EOVERFLOW; - q->prod = prod; + q->llq.prod = prod; return ret; } -static void queue_inc_prod(struct arm_smmu_queue *q) +static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n) { - u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1; - - q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); - writel(q->prod, q->prod_reg); + u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n; + return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); } -/* - * Wait for the SMMU to consume items. If sync is true, wait until the queue - * is empty. Otherwise, wait until there is at least one free slot. - */ -static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe) +static void queue_poll_init(struct arm_smmu_device *smmu, + struct arm_smmu_queue_poll *qp) { - ktime_t timeout; - unsigned int delay = 1, spin_cnt = 0; + qp->delay = 1; + qp->spin_cnt = 0; + qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); + qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US); +} - /* Wait longer if it's a CMD_SYNC */ - timeout = ktime_add_us(ktime_get(), sync ? - ARM_SMMU_CMDQ_SYNC_TIMEOUT_US : - ARM_SMMU_POLL_TIMEOUT_US); +static int queue_poll(struct arm_smmu_queue_poll *qp) +{ + if (ktime_compare(ktime_get(), qp->timeout) > 0) + return -ETIMEDOUT; - while (queue_sync_cons(q), (sync ? 
!queue_empty(q) : queue_full(q))) { - if (ktime_compare(ktime_get(), timeout) > 0) - return -ETIMEDOUT; - - if (wfe) { - wfe(); - } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) { - cpu_relax(); - continue; - } else { - udelay(delay); - delay *= 2; - spin_cnt = 0; - } + if (qp->wfe) { + wfe(); + } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) { + cpu_relax(); + } else { + udelay(qp->delay); + qp->delay *= 2; + qp->spin_cnt = 0; } return 0; @@ -777,16 +813,6 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords) *dst++ = cpu_to_le64(*src++); } -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent) -{ - if (queue_full(q)) - return -ENOSPC; - - queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords); - queue_inc_prod(q); - return 0; -} - static void queue_read(__le64 *dst, u64 *src, size_t n_dwords) { int i; @@ -797,11 +823,12 @@ static void queue_read(__le64 *dst, u64 *src, size_t n_dwords) static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent) { - if (queue_empty(q)) + if (queue_empty(&q->llq)) return -EAGAIN; - queue_read(ent, Q_ENT(q, q->cons), q->ent_dwords); - queue_inc_cons(q); + queue_read(ent, Q_ENT(q, q->llq.cons), q->ent_dwords); + queue_inc_cons(&q->llq); + queue_sync_cons_out(q); return 0; } @@ -868,20 +895,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp); break; case CMDQ_OP_CMD_SYNC: - if (ent->sync.msiaddr) + if (ent->sync.msiaddr) { cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_IRQ); - else + cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK; + } else { cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV); + } cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH); cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB); - /* - * Commands are written little-endian, but we want the SMMU to - * receive MSIData, and thus write it back to memory, in CPU - * byte order, so big-endian needs an extra byteswap here. - */ - cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA, - cpu_to_le32(ent->sync.msidata)); - cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK; break; default: return -ENOENT; @@ -890,6 +911,27 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) return 0; } +static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, + u32 prod) +{ + struct arm_smmu_queue *q = &smmu->cmdq.q; + struct arm_smmu_cmdq_ent ent = { + .opcode = CMDQ_OP_CMD_SYNC, + }; + + /* + * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI + * payload, so the write will zero the entire command on that platform. + */ + if (smmu->features & ARM_SMMU_FEAT_MSI && + smmu->features & ARM_SMMU_FEAT_COHERENCY) { + ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) * + q->ent_dwords * 8; + } + + arm_smmu_cmdq_build_cmd(cmd, &ent); +} + static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) { static const char *cerror_str[] = { @@ -948,109 +990,456 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); } -static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) +/* + * Command queue locking. + * This is a form of bastardised rwlock with the following major changes: + * + * - The only LOCK routines are exclusive_trylock() and shared_lock(). + * Neither have barrier semantics, and instead provide only a control + * dependency. 
+ * + * - The UNLOCK routines are supplemented with shared_tryunlock(), which + * fails if the caller appears to be the last lock holder (yes, this is + * racy). All successful UNLOCK routines have RELEASE semantics. + */ +static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq) { - struct arm_smmu_queue *q = &smmu->cmdq.q; - bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); + int val; - smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]); + /* + * We can try to avoid the cmpxchg() loop by simply incrementing the + * lock counter. When held in exclusive state, the lock counter is set + * to INT_MIN so these increments won't hurt as the value will remain + * negative. + */ + if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0) + return; - while (queue_insert_raw(q, cmd) == -ENOSPC) { - if (queue_poll_cons(q, false, wfe)) - dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); + do { + val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0); + } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val); +} + +static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq) +{ + (void)atomic_dec_return_release(&cmdq->lock); +} + +static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq) +{ + if (atomic_read(&cmdq->lock) == 1) + return false; + + arm_smmu_cmdq_shared_unlock(cmdq); + return true; +} + +#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \ +({ \ + bool __ret; \ + local_irq_save(flags); \ + __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \ + if (!__ret) \ + local_irq_restore(flags); \ + __ret; \ +}) + +#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \ +({ \ + atomic_set_release(&cmdq->lock, 0); \ + local_irq_restore(flags); \ +}) + + +/* + * Command queue insertion. + * This is made fiddly by our attempts to achieve some sort of scalability + * since there is one queue shared amongst all of the CPUs in the system. If + * you like mixed-size concurrency, dependency ordering and relaxed atomics, + * then you'll *love* this monstrosity. + * + * The basic idea is to split the queue up into ranges of commands that are + * owned by a given CPU; the owner may not have written all of the commands + * itself, but is responsible for advancing the hardware prod pointer when + * the time comes. The algorithm is roughly: + * + * 1. Allocate some space in the queue. At this point we also discover + * whether the head of the queue is currently owned by another CPU, + * or whether we are the owner. + * + * 2. Write our commands into our allocated slots in the queue. + * + * 3. Mark our slots as valid in arm_smmu_cmdq.valid_map. + * + * 4. If we are an owner: + * a. Wait for the previous owner to finish. + * b. Mark the queue head as unowned, which tells us the range + * that we are responsible for publishing. + * c. Wait for all commands in our owned range to become valid. + * d. Advance the hardware prod pointer. + * e. Tell the next owner we've finished. + * + * 5. If we are inserting a CMD_SYNC (we may or may not have been an + * owner), then we need to stick around until it has completed: + * a. If we have MSIs, the SMMU can write back into the CMD_SYNC + * to clear the first 4 bytes. + * b. Otherwise, we spin waiting for the hardware cons pointer to + * advance past our command. + * + * The devil is in the details, particularly the use of locking for handling + * SYNC completion and freeing up space in the queue before we think that it is + * full. 
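As a worked example of the space accounting in step 1 above, the following standalone sketch shows how the wrap bit above the index distinguishes "full" from "empty" and how free space is computed. Plain integers stand in for struct arm_smmu_ll_queue and the queue is fixed at eight entries; this is illustrative only and not part of the patch.

/*
 * Editorial sketch, not patch code: prod/cons encode an index in the low
 * max_n_shift bits plus a wrap flag in the next bit, so index-equal with
 * equal wrap bits means empty, and index-equal with different wrap bits
 * means full.
 */
#include <assert.h>

#define SHIFT	3				/* 8-entry queue */
#define IDX(p)	((p) & ((1 << SHIFT) - 1))
#define WRP(p)	((p) & (1 << SHIFT))

static int has_space(unsigned int prod, unsigned int cons, unsigned int n)
{
	unsigned int space;

	if (WRP(prod) == WRP(cons))
		space = (1 << SHIFT) - (IDX(prod) - IDX(cons));
	else
		space = IDX(cons) - IDX(prod);

	return space >= n;
}

int main(void)
{
	assert(has_space(0x0, 0x0, 8));		/* empty: the whole lap is free */
	assert(!has_space(0x8, 0x0, 1));	/* full: prod is one lap ahead */
	assert(has_space(0xa, 0x6, 4));		/* mid-wrap: 4 slots free */
	assert(!has_space(0xa, 0x6, 5));
	return 0;
}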
+ */ +static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq, + u32 sprod, u32 eprod, bool set) +{ + u32 swidx, sbidx, ewidx, ebidx; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + .prod = sprod, + }; + + ewidx = BIT_WORD(Q_IDX(&llq, eprod)); + ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG; + + while (llq.prod != eprod) { + unsigned long mask; + atomic_long_t *ptr; + u32 limit = BITS_PER_LONG; + + swidx = BIT_WORD(Q_IDX(&llq, llq.prod)); + sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG; + + ptr = &cmdq->valid_map[swidx]; + + if ((swidx == ewidx) && (sbidx < ebidx)) + limit = ebidx; + + mask = GENMASK(limit - 1, sbidx); + + /* + * The valid bit is the inverse of the wrap bit. This means + * that a zero-initialised queue is invalid and, after marking + * all entries as valid, they become invalid again when we + * wrap. + */ + if (set) { + atomic_long_xor(mask, ptr); + } else { /* Poll */ + unsigned long valid; + + valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask; + atomic_long_cond_read_relaxed(ptr, (VAL & mask) == valid); + } + + llq.prod = queue_inc_prod_n(&llq, limit - sbidx); } } -static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq_ent *ent) +/* Mark all entries in the range [sprod, eprod) as valid */ +static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq, + u32 sprod, u32 eprod) +{ + __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true); +} + +/* Wait for all entries in the range [sprod, eprod) to become valid */ +static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq, + u32 sprod, u32 eprod) +{ + __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false); +} + +/* Wait for the command queue to become non-full */ +static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) +{ + unsigned long flags; + struct arm_smmu_queue_poll qp; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + int ret = 0; + + /* + * Try to update our copy of cons by grabbing exclusive cmdq access. If + * that fails, spin until somebody else updates it for us. + */ + if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) { + WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg)); + arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags); + llq->val = READ_ONCE(cmdq->q.llq.val); + return 0; + } + + queue_poll_init(smmu, &qp); + do { + llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + if (!queue_full(llq)) + break; + + ret = queue_poll(&qp); + } while (!ret); + + return ret; +} + +/* + * Wait until the SMMU signals a CMD_SYNC completion MSI. + * Must be called with the cmdq lock held in some capacity. + */ +static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) +{ + int ret = 0; + struct arm_smmu_queue_poll qp; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod)); + + queue_poll_init(smmu, &qp); + + /* + * The MSI won't generate an event, since it's being written back + * into the command queue. + */ + qp.wfe = false; + smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp))); + llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1); + return ret; +} + +/* + * Wait until the SMMU cons index passes llq->prod. + * Must be called with the cmdq lock held in some capacity. 
+ */ +static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) +{ + struct arm_smmu_queue_poll qp; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + u32 prod = llq->prod; + int ret = 0; + + queue_poll_init(smmu, &qp); + llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + do { + if (queue_consumed(llq, prod)) + break; + + ret = queue_poll(&qp); + + /* + * This needs to be a readl() so that our subsequent call + * to arm_smmu_cmdq_shared_tryunlock() can fail accurately. + * + * Specifically, we need to ensure that we observe all + * shared_lock()s by other CMD_SYNCs that share our owner, + * so that a failing call to tryunlock() means that we're + * the last one out and therefore we can safely advance + * cmdq->q.llq.cons. Roughly speaking: + * + * CPU 0 CPU1 CPU2 (us) + * + * if (sync) + * shared_lock(); + * + * dma_wmb(); + * set_valid_map(); + * + * if (owner) { + * poll_valid_map(); + * + * writel(prod_reg); + * + * readl(cons_reg); + * tryunlock(); + * + * Requires us to see CPU 0's shared_lock() acquisition. + */ + llq->cons = readl(cmdq->q.cons_reg); + } while (!ret); + + return ret; +} + +static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu, + struct arm_smmu_ll_queue *llq) +{ + if (smmu->features & ARM_SMMU_FEAT_MSI && + smmu->features & ARM_SMMU_FEAT_COHERENCY) + return __arm_smmu_cmdq_poll_until_msi(smmu, llq); + + return __arm_smmu_cmdq_poll_until_consumed(smmu, llq); +} + +static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, + u32 prod, int n) +{ + int i; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + .prod = prod, + }; + + for (i = 0; i < n; ++i) { + u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS]; + + prod = queue_inc_prod_n(&llq, i); + queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS); + } +} + +/* + * This is the actual insertion function, and provides the following + * ordering guarantees to callers: + * + * - There is a dma_wmb() before publishing any commands to the queue. + * This can be relied upon to order prior writes to data structures + * in memory (such as a CD or an STE) before the command. + * + * - On completion of a CMD_SYNC, there is a control dependency. + * This can be relied upon to order subsequent writes to memory (e.g. + * freeing an IOVA) after completion of the CMD_SYNC. + * + * - Command insertion is totally ordered, so if two CPUs each race to + * insert their own list of commands then all of the commands from one + * CPU will appear before any of the commands from the other CPU. + */ +static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + u64 *cmds, int n, bool sync) +{ + u64 cmd_sync[CMDQ_ENT_DWORDS]; + u32 prod; + unsigned long flags; + bool owner; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_ll_queue llq = { + .max_n_shift = cmdq->q.llq.max_n_shift, + }, head = llq; + int ret = 0; + + /* 1. 
Allocate some space in the queue */ + local_irq_save(flags); + llq.val = READ_ONCE(cmdq->q.llq.val); + do { + u64 old; + + while (!queue_has_space(&llq, n + sync)) { + local_irq_restore(flags); + if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq)) + dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); + local_irq_save(flags); + } + + head.cons = llq.cons; + head.prod = queue_inc_prod_n(&llq, n + sync) | + CMDQ_PROD_OWNED_FLAG; + + old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val); + if (old == llq.val) + break; + + llq.val = old; + } while (1); + owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG); + head.prod &= ~CMDQ_PROD_OWNED_FLAG; + llq.prod &= ~CMDQ_PROD_OWNED_FLAG; + + /* + * 2. Write our commands into the queue + * Dependency ordering from the cmpxchg() loop above. + */ + arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); + if (sync) { + prod = queue_inc_prod_n(&llq, n); + arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod); + queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); + + /* + * In order to determine completion of our CMD_SYNC, we must + * ensure that the queue can't wrap twice without us noticing. + * We achieve that by taking the cmdq lock as shared before + * marking our slot as valid. + */ + arm_smmu_cmdq_shared_lock(cmdq); + } + + /* 3. Mark our slots as valid, ensuring commands are visible first */ + dma_wmb(); + arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod); + + /* 4. If we are the owner, take control of the SMMU hardware */ + if (owner) { + /* a. Wait for previous owner to finish */ + atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod); + + /* b. Stop gathering work by clearing the owned flag */ + prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG, + &cmdq->q.llq.atomic.prod); + prod &= ~CMDQ_PROD_OWNED_FLAG; + + /* + * c. Wait for any gathered work to be written to the queue. + * Note that we read our own entries so that we have the control + * dependency required by (d). + */ + arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod); + + /* + * d. Advance the hardware prod pointer + * Control dependency ordering from the entries becoming valid. + */ + writel_relaxed(prod, cmdq->q.prod_reg); + + /* + * e. Tell the next owner we're done + * Make sure we've updated the hardware first, so that we don't + * race to update prod and potentially move it backwards. + */ + atomic_set_release(&cmdq->owner_prod, prod); + } + + /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ + if (sync) { + llq.prod = queue_inc_prod_n(&llq, n); + ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq); + if (ret) { + dev_err_ratelimited(smmu->dev, + "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", + llq.prod, + readl_relaxed(cmdq->q.prod_reg), + readl_relaxed(cmdq->q.cons_reg)); + } + + /* + * Try to unlock the cmq lock. 
This will fail if we're the last + * reader, in which case we can safely update cmdq->q.llq.cons + */ + if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) { + WRITE_ONCE(cmdq->q.llq.cons, llq.cons); + arm_smmu_cmdq_shared_unlock(cmdq); + } + } + + local_irq_restore(flags); + return ret; +} + +static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) { u64 cmd[CMDQ_ENT_DWORDS]; - unsigned long flags; if (arm_smmu_cmdq_build_cmd(cmd, ent)) { dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n", ent->opcode); - return; + return -EINVAL; } - spin_lock_irqsave(&smmu->cmdq.lock, flags); - arm_smmu_cmdq_insert_cmd(smmu, cmd); - spin_unlock_irqrestore(&smmu->cmdq.lock, flags); -} - -/* - * The difference between val and sync_idx is bounded by the maximum size of - * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic. - */ -static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx) -{ - ktime_t timeout; - u32 val; - - timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US); - val = smp_cond_load_acquire(&smmu->sync_count, - (int)(VAL - sync_idx) >= 0 || - !ktime_before(ktime_get(), timeout)); - - return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0; -} - -static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu) -{ - u64 cmd[CMDQ_ENT_DWORDS]; - unsigned long flags; - struct arm_smmu_cmdq_ent ent = { - .opcode = CMDQ_OP_CMD_SYNC, - .sync = { - .msiaddr = virt_to_phys(&smmu->sync_count), - }, - }; - - spin_lock_irqsave(&smmu->cmdq.lock, flags); - - /* Piggy-back on the previous command if it's a SYNC */ - if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) { - ent.sync.msidata = smmu->sync_nr; - } else { - ent.sync.msidata = ++smmu->sync_nr; - arm_smmu_cmdq_build_cmd(cmd, &ent); - arm_smmu_cmdq_insert_cmd(smmu, cmd); - } - - spin_unlock_irqrestore(&smmu->cmdq.lock, flags); - - return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata); -} - -static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) -{ - u64 cmd[CMDQ_ENT_DWORDS]; - unsigned long flags; - bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); - struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC }; - int ret; - - arm_smmu_cmdq_build_cmd(cmd, &ent); - - spin_lock_irqsave(&smmu->cmdq.lock, flags); - arm_smmu_cmdq_insert_cmd(smmu, cmd); - ret = queue_poll_cons(&smmu->cmdq.q, true, wfe); - spin_unlock_irqrestore(&smmu->cmdq.lock, flags); - - return ret; + return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false); } static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) { - int ret; - bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) && - (smmu->features & ARM_SMMU_FEAT_COHERENCY); - - ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu) - : __arm_smmu_cmdq_issue_sync(smmu); - if (ret) - dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n"); - return ret; + return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true); } /* Context descriptor manipulation functions */ @@ -1305,6 +1694,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) int i; struct arm_smmu_device *smmu = dev; struct arm_smmu_queue *q = &smmu->evtq.q; + struct arm_smmu_ll_queue *llq = &q->llq; u64 evt[EVTQ_ENT_DWORDS]; do { @@ -1322,12 +1712,13 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) * Not much we can do on overflow, so scream and pretend we're * trying harder. 
*/ - if (queue_sync_prod(q) == -EOVERFLOW) + if (queue_sync_prod_in(q) == -EOVERFLOW) dev_err(smmu->dev, "EVTQ overflow detected -- events lost\n"); - } while (!queue_empty(q)); + } while (!queue_empty(llq)); /* Sync our overflow flag, as we believe we're up to speed */ - q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons); + llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) | + Q_IDX(llq, llq->cons); return IRQ_HANDLED; } @@ -1373,19 +1764,21 @@ static irqreturn_t arm_smmu_priq_thread(int irq, void *dev) { struct arm_smmu_device *smmu = dev; struct arm_smmu_queue *q = &smmu->priq.q; + struct arm_smmu_ll_queue *llq = &q->llq; u64 evt[PRIQ_ENT_DWORDS]; do { while (!queue_remove_raw(q, evt)) arm_smmu_handle_ppr(smmu, evt); - if (queue_sync_prod(q) == -EOVERFLOW) + if (queue_sync_prod_in(q) == -EOVERFLOW) dev_err(smmu->dev, "PRIQ overflow detected -- requests lost\n"); - } while (!queue_empty(q)); + } while (!queue_empty(llq)); /* Sync our overflow flag, as we believe we're up to speed */ - q->cons = Q_OVF(q, q->prod) | Q_WRP(q, q->cons) | Q_IDX(q, q->cons); - writel(q->cons, q->cons_reg); + llq->cons = Q_OVF(llq->prod) | Q_WRP(llq, llq->cons) | + Q_IDX(llq, llq->cons); + queue_sync_cons_out(q); return IRQ_HANDLED; } @@ -1534,6 +1927,23 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) return 0; + /* + * Ensure that we've completed prior invalidation of the main TLBs + * before we read 'nr_ats_masters' in case of a concurrent call to + * arm_smmu_enable_ats(): + * + * // unmap() // arm_smmu_enable_ats() + * TLBI+SYNC atomic_inc(&nr_ats_masters); + * smp_mb(); [...] + * atomic_read(&nr_ats_masters); pci_enable_ats() // writel() + * + * Ensures that we always see the incremented 'nr_ats_masters' count if + * ATS was enabled at the PCI device before completion of the TLBI. + */ + smp_mb(); + if (!atomic_read(&smmu_domain->nr_ats_masters)) + return 0; + arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); spin_lock_irqsave(&smmu_domain->devices_lock, flags); @@ -1545,13 +1955,6 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, } /* IO_PGTABLE API */ -static void arm_smmu_tlb_sync(void *cookie) -{ - struct arm_smmu_domain *smmu_domain = cookie; - - arm_smmu_cmdq_issue_sync(smmu_domain->smmu); -} - static void arm_smmu_tlb_inv_context(void *cookie) { struct arm_smmu_domain *smmu_domain = cookie; @@ -1570,25 +1973,32 @@ static void arm_smmu_tlb_inv_context(void *cookie) /* * NOTE: when io-pgtable is in non-strict mode, we may get here with * PTEs previously cleared by unmaps on the current CPU not yet visible - * to the SMMU. We are relying on the DSB implicit in queue_inc_prod() - * to guarantee those are observed before the TLBI. Do be careful, 007. + * to the SMMU. We are relying on the dma_wmb() implicit during cmd + * insertion to guarantee those are observed before the TLBI. Do be + * careful, 007. 
*/ arm_smmu_cmdq_issue_cmd(smmu, &cmd); arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0); } -static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) +static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size, + size_t granule, bool leaf, + struct arm_smmu_domain *smmu_domain) { - struct arm_smmu_domain *smmu_domain = cookie; + u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS]; struct arm_smmu_device *smmu = smmu_domain->smmu; + unsigned long start = iova, end = iova + size; + int i = 0; struct arm_smmu_cmdq_ent cmd = { .tlbi = { .leaf = leaf, - .addr = iova, }, }; + if (!size) + return; + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { cmd.opcode = CMDQ_OP_TLBI_NH_VA; cmd.tlbi.asid = smmu_domain->s1_cfg.cd.asid; @@ -1597,16 +2007,54 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; } - do { - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - cmd.tlbi.addr += granule; - } while (size -= granule); + while (iova < end) { + if (i == CMDQ_BATCH_ENTRIES) { + arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false); + i = 0; + } + + cmd.tlbi.addr = iova; + arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd); + iova += granule; + i++; + } + + arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, true); + + /* + * Unfortunately, this can't be leaf-only since we may have + * zapped an entire table. + */ + arm_smmu_atc_inv_domain(smmu_domain, 0, start, size); } -static const struct iommu_gather_ops arm_smmu_gather_ops = { +static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, + void *cookie) +{ + struct arm_smmu_domain *smmu_domain = cookie; + struct iommu_domain *domain = &smmu_domain->domain; + + iommu_iotlb_gather_add_page(domain, gather, iova, granule); +} + +static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + arm_smmu_tlb_inv_range(iova, size, granule, false, cookie); +} + +static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + arm_smmu_tlb_inv_range(iova, size, granule, true, cookie); +} + +static const struct iommu_flush_ops arm_smmu_flush_ops = { .tlb_flush_all = arm_smmu_tlb_inv_context, - .tlb_add_flush = arm_smmu_tlb_inv_range_nosync, - .tlb_sync = arm_smmu_tlb_sync, + .tlb_flush_walk = arm_smmu_tlb_inv_walk, + .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, + .tlb_add_page = arm_smmu_tlb_inv_page_nosync, }; /* IOMMU API */ @@ -1796,7 +2244,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) .ias = ias, .oas = oas, .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY, - .tlb = &arm_smmu_gather_ops, + .tlb = &arm_smmu_flush_ops, .iommu_dev = smmu->dev, }; @@ -1863,44 +2311,58 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) } } -static int arm_smmu_enable_ats(struct arm_smmu_master *master) +static bool arm_smmu_ats_supported(struct arm_smmu_master *master) { - int ret; - size_t stu; struct pci_dev *pdev; struct arm_smmu_device *smmu = master->smmu; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) || !(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS) || pci_ats_disabled()) - return -ENXIO; + return false; pdev = to_pci_dev(master->dev); - if (pdev->untrusted) - return -EPERM; + return !pdev->untrusted && pdev->ats_cap; +} + +static void arm_smmu_enable_ats(struct 
arm_smmu_master *master) +{ + size_t stu; + struct pci_dev *pdev; + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_domain *smmu_domain = master->domain; + + /* Don't enable ATS at the endpoint if it's not enabled in the STE */ + if (!master->ats_enabled) + return; /* Smallest Translation Unit: log2 of the smallest supported granule */ stu = __ffs(smmu->pgsize_bitmap); + pdev = to_pci_dev(master->dev); - ret = pci_enable_ats(pdev, stu); - if (ret) - return ret; - - master->ats_enabled = true; - return 0; + atomic_inc(&smmu_domain->nr_ats_masters); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0); + if (pci_enable_ats(pdev, stu)) + dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); } static void arm_smmu_disable_ats(struct arm_smmu_master *master) { struct arm_smmu_cmdq_ent cmd; + struct arm_smmu_domain *smmu_domain = master->domain; - if (!master->ats_enabled || !dev_is_pci(master->dev)) + if (!master->ats_enabled) return; + pci_disable_ats(to_pci_dev(master->dev)); + /* + * Ensure ATS is disabled at the endpoint before we issue the + * ATC invalidation via the SMMU. + */ + wmb(); arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd); arm_smmu_atc_inv_master(master, &cmd); - pci_disable_ats(to_pci_dev(master->dev)); - master->ats_enabled = false; + atomic_dec(&smmu_domain->nr_ats_masters); } static void arm_smmu_detach_dev(struct arm_smmu_master *master) @@ -1911,14 +2373,15 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) if (!smmu_domain) return; + arm_smmu_disable_ats(master); + spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_del(&master->domain_head); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); master->domain = NULL; + master->ats_enabled = false; arm_smmu_install_ste_for_dev(master); - - arm_smmu_disable_ats(master); } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) @@ -1958,17 +2421,20 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) master->domain = smmu_domain; - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_add(&master->domain_head, &smmu_domain->devices); - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS) - arm_smmu_enable_ats(master); + master->ats_enabled = arm_smmu_ats_supported(master); if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) arm_smmu_write_ctx_desc(smmu, &smmu_domain->s1_cfg); arm_smmu_install_ste_for_dev(master); + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_add(&master->domain_head, &smmu_domain->devices); + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + arm_smmu_enable_ats(master); + out_unlock: mutex_unlock(&smmu_domain->init_mutex); return ret; @@ -1985,21 +2451,16 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, return ops->map(ops, iova, paddr, size, prot); } -static size_t -arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) +static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, + size_t size, struct iommu_iotlb_gather *gather) { - int ret; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; if (!ops) return 0; - ret = ops->unmap(ops, iova, size); - if (ret && arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size)) - return 0; - - return ret; + return ops->unmap(ops, iova, size, gather); } static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) @@ -2010,12 +2471,13 @@ static void 
arm_smmu_flush_iotlb_all(struct iommu_domain *domain) arm_smmu_tlb_inv_context(smmu_domain); } -static void arm_smmu_iotlb_sync(struct iommu_domain *domain) +static void arm_smmu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - if (smmu) - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start, + gather->pgsize, true, smmu_domain); } static phys_addr_t @@ -2286,13 +2748,13 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, size_t qsz; do { - qsz = ((1 << q->max_n_shift) * dwords) << 3; + qsz = ((1 << q->llq.max_n_shift) * dwords) << 3; q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma, GFP_KERNEL); if (q->base || qsz < PAGE_SIZE) break; - q->max_n_shift--; + q->llq.max_n_shift--; } while (1); if (!q->base) { @@ -2304,7 +2766,7 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, if (!WARN_ON(q->base_dma & (qsz - 1))) { dev_info(smmu->dev, "allocated %u entries for %s\n", - 1 << q->max_n_shift, name); + 1 << q->llq.max_n_shift, name); } q->prod_reg = arm_smmu_page1_fixup(prod_off, smmu); @@ -2313,24 +2775,55 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, q->q_base = Q_BASE_RWA; q->q_base |= q->base_dma & Q_BASE_ADDR_MASK; - q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->max_n_shift); + q->q_base |= FIELD_PREP(Q_BASE_LOG2SIZE, q->llq.max_n_shift); - q->prod = q->cons = 0; + q->llq.prod = q->llq.cons = 0; return 0; } +static void arm_smmu_cmdq_free_bitmap(void *data) +{ + unsigned long *bitmap = data; + bitmap_free(bitmap); +} + +static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu) +{ + int ret = 0; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + unsigned int nents = 1 << cmdq->q.llq.max_n_shift; + atomic_long_t *bitmap; + + atomic_set(&cmdq->owner_prod, 0); + atomic_set(&cmdq->lock, 0); + + bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL); + if (!bitmap) { + dev_err(smmu->dev, "failed to allocate cmdq bitmap\n"); + ret = -ENOMEM; + } else { + cmdq->valid_map = bitmap; + devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap); + } + + return ret; +} + static int arm_smmu_init_queues(struct arm_smmu_device *smmu) { int ret; /* cmdq */ - spin_lock_init(&smmu->cmdq.lock); ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD, ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS, "cmdq"); if (ret) return ret; + ret = arm_smmu_cmdq_init(smmu); + if (ret) + return ret; + /* evtq */ ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD, ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS, @@ -2708,8 +3201,8 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) /* Command queue */ writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE); - writel_relaxed(smmu->cmdq.q.prod, smmu->base + ARM_SMMU_CMDQ_PROD); - writel_relaxed(smmu->cmdq.q.cons, smmu->base + ARM_SMMU_CMDQ_CONS); + writel_relaxed(smmu->cmdq.q.llq.prod, smmu->base + ARM_SMMU_CMDQ_PROD); + writel_relaxed(smmu->cmdq.q.llq.cons, smmu->base + ARM_SMMU_CMDQ_CONS); enables = CR0_CMDQEN; ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0, @@ -2736,9 +3229,9 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) /* Event queue */ writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE); - writel_relaxed(smmu->evtq.q.prod, + writel_relaxed(smmu->evtq.q.llq.prod, 
arm_smmu_page1_fixup(ARM_SMMU_EVTQ_PROD, smmu)); - writel_relaxed(smmu->evtq.q.cons, + writel_relaxed(smmu->evtq.q.llq.cons, arm_smmu_page1_fixup(ARM_SMMU_EVTQ_CONS, smmu)); enables |= CR0_EVTQEN; @@ -2753,9 +3246,9 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) if (smmu->features & ARM_SMMU_FEAT_PRI) { writeq_relaxed(smmu->priq.q.q_base, smmu->base + ARM_SMMU_PRIQ_BASE); - writel_relaxed(smmu->priq.q.prod, + writel_relaxed(smmu->priq.q.llq.prod, arm_smmu_page1_fixup(ARM_SMMU_PRIQ_PROD, smmu)); - writel_relaxed(smmu->priq.q.cons, + writel_relaxed(smmu->priq.q.llq.cons, arm_smmu_page1_fixup(ARM_SMMU_PRIQ_CONS, smmu)); enables |= CR0_PRIQEN; @@ -2909,18 +3402,24 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) } /* Queue sizes, capped to ensure natural alignment */ - smmu->cmdq.q.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, - FIELD_GET(IDR1_CMDQS, reg)); - if (!smmu->cmdq.q.max_n_shift) { - /* Odd alignment restrictions on the base, so ignore for now */ - dev_err(smmu->dev, "unit-length command queue not supported\n"); + smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, + FIELD_GET(IDR1_CMDQS, reg)); + if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) { + /* + * We don't support splitting up batches, so one batch of + * commands plus an extra sync needs to fit inside the command + * queue. There's also no way we can handle the weird alignment + * restrictions on the base pointer for a unit-length queue. + */ + dev_err(smmu->dev, "command queue size <= %d entries not supported\n", + CMDQ_BATCH_ENTRIES); return -ENXIO; } - smmu->evtq.q.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT, - FIELD_GET(IDR1_EVTQS, reg)); - smmu->priq.q.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT, - FIELD_GET(IDR1_PRIQS, reg)); + smmu->evtq.q.llq.max_n_shift = min_t(u32, EVTQ_MAX_SZ_SHIFT, + FIELD_GET(IDR1_EVTQS, reg)); + smmu->priq.q.llq.max_n_shift = min_t(u32, PRIQ_MAX_SZ_SHIFT, + FIELD_GET(IDR1_PRIQS, reg)); /* SID/SSID sizes */ smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg); diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 523a88842e7f..5b93c79371e9 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -366,7 +366,7 @@ static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size, * On MMU-401 at least, the cost of firing off multiple TLBIVMIDs appears * almost negligible, but the benefit of getting the first one in as far ahead * of the sync as possible is significant, hence we don't just make this a - * no-op and set .tlb_sync to arm_smmu_inv_context_s2() as you might think. + * no-op and set .tlb_sync to arm_smmu_tlb_inv_context_s2() as you might think. 
*/ static void arm_smmu_tlb_inv_vmid_nosync(unsigned long iova, size_t size, size_t granule, bool leaf, void *cookie) @@ -380,22 +380,67 @@ static void arm_smmu_tlb_inv_vmid_nosync(unsigned long iova, size_t size, arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_TLBIVMID, smmu_domain->cfg.vmid); } -static const struct iommu_gather_ops arm_smmu_s1_tlb_ops = { - .tlb_flush_all = arm_smmu_tlb_inv_context_s1, - .tlb_add_flush = arm_smmu_tlb_inv_range_s1, - .tlb_sync = arm_smmu_tlb_sync_context, +static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + struct arm_smmu_domain *smmu_domain = cookie; + const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops; + + ops->tlb_inv_range(iova, size, granule, false, cookie); + ops->tlb_sync(cookie); +} + +static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + struct arm_smmu_domain *smmu_domain = cookie; + const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops; + + ops->tlb_inv_range(iova, size, granule, true, cookie); + ops->tlb_sync(cookie); +} + +static void arm_smmu_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, + void *cookie) +{ + struct arm_smmu_domain *smmu_domain = cookie; + const struct arm_smmu_flush_ops *ops = smmu_domain->flush_ops; + + ops->tlb_inv_range(iova, granule, granule, true, cookie); +} + +static const struct arm_smmu_flush_ops arm_smmu_s1_tlb_ops = { + .tlb = { + .tlb_flush_all = arm_smmu_tlb_inv_context_s1, + .tlb_flush_walk = arm_smmu_tlb_inv_walk, + .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, + .tlb_add_page = arm_smmu_tlb_add_page, + }, + .tlb_inv_range = arm_smmu_tlb_inv_range_s1, + .tlb_sync = arm_smmu_tlb_sync_context, }; -static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v2 = { - .tlb_flush_all = arm_smmu_tlb_inv_context_s2, - .tlb_add_flush = arm_smmu_tlb_inv_range_s2, - .tlb_sync = arm_smmu_tlb_sync_context, +static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v2 = { + .tlb = { + .tlb_flush_all = arm_smmu_tlb_inv_context_s2, + .tlb_flush_walk = arm_smmu_tlb_inv_walk, + .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, + .tlb_add_page = arm_smmu_tlb_add_page, + }, + .tlb_inv_range = arm_smmu_tlb_inv_range_s2, + .tlb_sync = arm_smmu_tlb_sync_context, }; -static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v1 = { - .tlb_flush_all = arm_smmu_tlb_inv_context_s2, - .tlb_add_flush = arm_smmu_tlb_inv_vmid_nosync, - .tlb_sync = arm_smmu_tlb_sync_vmid, +static const struct arm_smmu_flush_ops arm_smmu_s2_tlb_ops_v1 = { + .tlb = { + .tlb_flush_all = arm_smmu_tlb_inv_context_s2, + .tlb_flush_walk = arm_smmu_tlb_inv_walk, + .tlb_flush_leaf = arm_smmu_tlb_inv_leaf, + .tlb_add_page = arm_smmu_tlb_add_page, + }, + .tlb_inv_range = arm_smmu_tlb_inv_vmid_nosync, + .tlb_sync = arm_smmu_tlb_sync_vmid, }; static irqreturn_t arm_smmu_context_fault(int irq, void *dev) @@ -667,7 +712,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, ias = min(ias, 32UL); oas = min(oas, 32UL); } - smmu_domain->tlb_ops = &arm_smmu_s1_tlb_ops; + smmu_domain->flush_ops = &arm_smmu_s1_tlb_ops; break; case ARM_SMMU_DOMAIN_NESTED: /* @@ -687,9 +732,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, oas = min(oas, 40UL); } if (smmu->version == ARM_SMMU_V2) - smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v2; + smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v2; else - smmu_domain->tlb_ops = &arm_smmu_s2_tlb_ops_v1; + smmu_domain->flush_ops = &arm_smmu_s2_tlb_ops_v1; break; default: ret = 
-EINVAL; @@ -725,7 +770,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, .ias = ias, .oas = oas, .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENT_WALK, - .tlb = smmu_domain->tlb_ops, + .tlb = &smmu_domain->flush_ops->tlb, .iommu_dev = smmu->dev, }; @@ -1131,7 +1176,7 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, } static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; @@ -1141,7 +1186,7 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, return 0; arm_smmu_rpm_get(smmu); - ret = ops->unmap(ops, iova, size); + ret = ops->unmap(ops, iova, size, gather); arm_smmu_rpm_put(smmu); return ret; @@ -1152,21 +1197,22 @@ static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - if (smmu_domain->tlb_ops) { + if (smmu_domain->flush_ops) { arm_smmu_rpm_get(smmu); - smmu_domain->tlb_ops->tlb_flush_all(smmu_domain); + smmu_domain->flush_ops->tlb.tlb_flush_all(smmu_domain); arm_smmu_rpm_put(smmu); } } -static void arm_smmu_iotlb_sync(struct iommu_domain *domain) +static void arm_smmu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - if (smmu_domain->tlb_ops) { + if (smmu_domain->flush_ops) { arm_smmu_rpm_get(smmu); - smmu_domain->tlb_ops->tlb_sync(smmu_domain); + smmu_domain->flush_ops->tlb_sync(smmu_domain); arm_smmu_rpm_put(smmu); } } diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h index ac9eac966cf5..b19b6cae9b5e 100644 --- a/drivers/iommu/arm-smmu.h +++ b/drivers/iommu/arm-smmu.h @@ -304,10 +304,17 @@ enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_BYPASS, }; +struct arm_smmu_flush_ops { + struct iommu_flush_ops tlb; + void (*tlb_inv_range)(unsigned long iova, size_t size, size_t granule, + bool leaf, void *cookie); + void (*tlb_sync)(void *cookie); +}; + struct arm_smmu_domain { struct arm_smmu_device *smmu; struct io_pgtable_ops *pgtbl_ops; - const struct iommu_gather_ops *tlb_ops; + const struct arm_smmu_flush_ops *flush_ops; struct arm_smmu_cfg cfg; enum arm_smmu_domain_stage stage; bool non_strict; diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index a7f9c3edbcb2..80beb1f5994a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -444,13 +444,18 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iova_domain *iovad = &cookie->iovad; size_t iova_off = iova_offset(iovad, dma_addr); + struct iommu_iotlb_gather iotlb_gather; + size_t unmapped; dma_addr -= iova_off; size = iova_align(iovad, size + iova_off); + iommu_iotlb_gather_init(&iotlb_gather); + + unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather); + WARN_ON(unmapped != size); - WARN_ON(iommu_unmap_fast(domain, dma_addr, size) != size); if (!cookie->fq_domain) - iommu_tlb_sync(domain); + iommu_tlb_sync(domain, &iotlb_gather); iommu_dma_free_iova(cookie, dma_addr, size); } diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index b0c1e5f9daae..cf5af34cb681 100644 --- a/drivers/iommu/exynos-iommu.c +++ 
b/drivers/iommu/exynos-iommu.c @@ -1130,7 +1130,8 @@ static void exynos_iommu_tlb_invalidate_entry(struct exynos_iommu_domain *domain } static size_t exynos_iommu_unmap(struct iommu_domain *iommu_domain, - unsigned long l_iova, size_t size) + unsigned long l_iova, size_t size, + struct iommu_iotlb_gather *gather) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); sysmmu_iova_t iova = (sysmmu_iova_t)l_iova; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ac4172c02244..b9fb8d6ddc6e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5147,7 +5147,8 @@ static int intel_iommu_map(struct iommu_domain *domain, } static size_t intel_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size) + unsigned long iova, size_t size, + struct iommu_iotlb_gather *gather) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct page *freelist = NULL; diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 0fc8dfab2abf..18e7d212c7de 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -362,7 +362,8 @@ static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl) return false; } -static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *, unsigned long, +static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *, + struct iommu_iotlb_gather *, unsigned long, size_t, int, arm_v7s_iopte *); static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, @@ -383,7 +384,7 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data, size_t sz = ARM_V7S_BLOCK_SIZE(lvl); tblp = ptep - ARM_V7S_LVL_IDX(iova, lvl); - if (WARN_ON(__arm_v7s_unmap(data, iova + i * sz, + if (WARN_ON(__arm_v7s_unmap(data, NULL, iova + i * sz, sz, lvl, tblp) != sz)) return -EINVAL; } else if (ptep[i]) { @@ -493,9 +494,8 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova, * a chance for anything to kick off a table walk for the new iova. 
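The io-pgtable hunks below switch the backends from the old add_flush/sync pair to the new flush_walk/flush_leaf/add_page helpers. For orientation, those helpers are thin wrappers around the iommu_flush_ops callbacks; they are defined in include/linux/io-pgtable.h outside this diff, so the bodies sketched here are an assumption based on how the call sites below use them.

/*
 * Editorial sketch, not part of this diff: the io_pgtable_tlb_* helpers
 * used below are assumed to forward to the new callbacks roughly like so.
 */
static inline void io_pgtable_tlb_flush_walk(struct io_pgtable *iop,
					     unsigned long iova, size_t size,
					     size_t granule)
{
	iop->cfg.tlb->tlb_flush_walk(iova, size, granule, iop->cookie);
}

static inline void io_pgtable_tlb_add_page(struct io_pgtable *iop,
					   struct iommu_iotlb_gather *gather,
					   unsigned long iova, size_t granule)
{
	if (iop->cfg.tlb->tlb_add_page)
		iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie);
}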
*/ if (iop->cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) { - io_pgtable_tlb_add_flush(iop, iova, size, - ARM_V7S_BLOCK_SIZE(2), false); - io_pgtable_tlb_sync(iop); + io_pgtable_tlb_flush_walk(iop, iova, size, + ARM_V7S_BLOCK_SIZE(2)); } else { wmb(); } @@ -541,12 +541,12 @@ static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg); size *= ARM_V7S_CONT_PAGES; - io_pgtable_tlb_add_flush(iop, iova, size, size, true); - io_pgtable_tlb_sync(iop); + io_pgtable_tlb_flush_leaf(iop, iova, size, size); return pte; } static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, arm_v7s_iopte blk_pte, arm_v7s_iopte *ptep) @@ -583,15 +583,15 @@ static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, return 0; tablep = iopte_deref(pte, 1); - return __arm_v7s_unmap(data, iova, size, 2, tablep); + return __arm_v7s_unmap(data, gather, iova, size, 2, tablep); } - io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true); - io_pgtable_tlb_sync(&data->iop); + io_pgtable_tlb_add_page(&data->iop, gather, iova, size); return size; } static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, int lvl, arm_v7s_iopte *ptep) { @@ -638,9 +638,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, for (i = 0; i < num_entries; i++) { if (ARM_V7S_PTE_IS_TABLE(pte[i], lvl)) { /* Also flush any partial walks */ - io_pgtable_tlb_add_flush(iop, iova, blk_size, - ARM_V7S_BLOCK_SIZE(lvl + 1), false); - io_pgtable_tlb_sync(iop); + io_pgtable_tlb_flush_walk(iop, iova, blk_size, + ARM_V7S_BLOCK_SIZE(lvl + 1)); ptep = iopte_deref(pte[i], lvl); __arm_v7s_free_table(ptep, lvl + 1, data); } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { @@ -651,8 +650,7 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, */ smp_wmb(); } else { - io_pgtable_tlb_add_flush(iop, iova, blk_size, - blk_size, true); + io_pgtable_tlb_add_page(iop, gather, iova, blk_size); } iova += blk_size; } @@ -662,23 +660,24 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, * Insert a table at the next level to map the old region, * minus the part we want to unmap */ - return arm_v7s_split_blk_unmap(data, iova, size, pte[0], ptep); + return arm_v7s_split_blk_unmap(data, gather, iova, size, pte[0], + ptep); } /* Keep on walkin' */ ptep = iopte_deref(pte[0], lvl); - return __arm_v7s_unmap(data, iova, size, lvl + 1, ptep); + return __arm_v7s_unmap(data, gather, iova, size, lvl + 1, ptep); } static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops); if (WARN_ON(upper_32_bits(iova))) return 0; - return __arm_v7s_unmap(data, iova, size, 1, data->pgd); + return __arm_v7s_unmap(data, gather, iova, size, 1, data->pgd); } static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops, @@ -806,22 +805,24 @@ static void dummy_tlb_flush_all(void *cookie) WARN_ON(cookie != cfg_cookie); } -static void dummy_tlb_add_flush(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) +static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule, + void *cookie) { WARN_ON(cookie != cfg_cookie); WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); } -static void dummy_tlb_sync(void *cookie) +static void dummy_tlb_add_page(struct 
iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) { - WARN_ON(cookie != cfg_cookie); + dummy_tlb_flush(iova, granule, granule, cookie); } -static const struct iommu_gather_ops dummy_tlb_ops = { +static const struct iommu_flush_ops dummy_tlb_ops = { .tlb_flush_all = dummy_tlb_flush_all, - .tlb_add_flush = dummy_tlb_add_flush, - .tlb_sync = dummy_tlb_sync, + .tlb_flush_walk = dummy_tlb_flush, + .tlb_flush_leaf = dummy_tlb_flush, + .tlb_add_page = dummy_tlb_add_page, }; #define __FAIL(ops) ({ \ @@ -896,7 +897,7 @@ static int __init arm_v7s_do_selftests(void) size = 1UL << __ffs(cfg.pgsize_bitmap); while (i < loopnr) { iova_start = i * SZ_16M; - if (ops->unmap(ops, iova_start + size, size) != size) + if (ops->unmap(ops, iova_start + size, size, NULL) != size) return __FAIL(ops); /* Remap of partial unmap */ @@ -914,7 +915,7 @@ static int __init arm_v7s_do_selftests(void) for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) { size = 1UL << i; - if (ops->unmap(ops, iova, size) != size) + if (ops->unmap(ops, iova, size, NULL) != size) return __FAIL(ops); if (ops->iova_to_phys(ops, iova + 42)) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 161a7d56264d..4c91359057c5 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -290,6 +289,7 @@ static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte, } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, int lvl, arm_lpae_iopte *ptep); @@ -335,8 +335,10 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data); tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data); - if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz)) + if (__arm_lpae_unmap(data, NULL, iova, sz, lvl, tblp) != sz) { + WARN_ON(1); return -EINVAL; + } } __arm_lpae_init_pte(data, paddr, prot, lvl, ptep); @@ -537,6 +539,7 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop) } static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, arm_lpae_iopte blk_pte, int lvl, arm_lpae_iopte *ptep) @@ -582,15 +585,15 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, tablep = iopte_deref(pte, data); } else if (unmap_idx >= 0) { - io_pgtable_tlb_add_flush(&data->iop, iova, size, size, true); - io_pgtable_tlb_sync(&data->iop); + io_pgtable_tlb_add_page(&data->iop, gather, iova, size); return size; } - return __arm_lpae_unmap(data, iova, size, lvl, tablep); + return __arm_lpae_unmap(data, gather, iova, size, lvl, tablep); } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, + struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, int lvl, arm_lpae_iopte *ptep) { @@ -612,9 +615,8 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ - io_pgtable_tlb_add_flush(iop, iova, size, - ARM_LPAE_GRANULE(data), false); - io_pgtable_tlb_sync(iop); + io_pgtable_tlb_flush_walk(iop, iova, size, + ARM_LPAE_GRANULE(data)); ptep = iopte_deref(pte, data); __arm_lpae_free_pgtable(data, lvl + 1, ptep); } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { @@ -625,7 +627,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, */ smp_wmb(); } else { - 
io_pgtable_tlb_add_flush(iop, iova, size, size, true); + io_pgtable_tlb_add_page(iop, gather, iova, size); } return size; @@ -634,17 +636,17 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, * Insert a table at the next level to map the old region, * minus the part we want to unmap */ - return arm_lpae_split_blk_unmap(data, iova, size, pte, + return arm_lpae_split_blk_unmap(data, gather, iova, size, pte, lvl + 1, ptep); } /* Keep on walkin' */ ptep = iopte_deref(pte, data); - return __arm_lpae_unmap(data, iova, size, lvl + 1, ptep); + return __arm_lpae_unmap(data, gather, iova, size, lvl + 1, ptep); } static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); arm_lpae_iopte *ptep = data->pgd; @@ -653,7 +655,7 @@ static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias))) return 0; - return __arm_lpae_unmap(data, iova, size, lvl, ptep); + return __arm_lpae_unmap(data, gather, iova, size, lvl, ptep); } static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, @@ -1070,22 +1072,24 @@ static void dummy_tlb_flush_all(void *cookie) WARN_ON(cookie != cfg_cookie); } -static void dummy_tlb_add_flush(unsigned long iova, size_t size, - size_t granule, bool leaf, void *cookie) +static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule, + void *cookie) { WARN_ON(cookie != cfg_cookie); WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); } -static void dummy_tlb_sync(void *cookie) +static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) { - WARN_ON(cookie != cfg_cookie); + dummy_tlb_flush(iova, granule, granule, cookie); } -static const struct iommu_gather_ops dummy_tlb_ops __initconst = { +static const struct iommu_flush_ops dummy_tlb_ops __initconst = { .tlb_flush_all = dummy_tlb_flush_all, - .tlb_add_flush = dummy_tlb_add_flush, - .tlb_sync = dummy_tlb_sync, + .tlb_flush_walk = dummy_tlb_flush, + .tlb_flush_leaf = dummy_tlb_flush, + .tlb_add_page = dummy_tlb_add_page, }; static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops) @@ -1168,7 +1172,7 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) /* Partial unmap */ size = 1UL << __ffs(cfg->pgsize_bitmap); - if (ops->unmap(ops, SZ_1G + size, size) != size) + if (ops->unmap(ops, SZ_1G + size, size, NULL) != size) return __FAIL(ops, i); /* Remap of partial unmap */ @@ -1183,7 +1187,7 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { size = 1UL << j; - if (ops->unmap(ops, iova, size) != size) + if (ops->unmap(ops, iova, size, NULL) != size) return __FAIL(ops, i); if (ops->iova_to_phys(ops, iova + 42)) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0c674d80c37f..70bfbcc09248 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1862,7 +1862,7 @@ EXPORT_SYMBOL_GPL(iommu_map); static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, - bool sync) + struct iommu_iotlb_gather *iotlb_gather) { const struct iommu_ops *ops = domain->ops; size_t unmapped_page, unmapped = 0; @@ -1899,13 +1899,10 @@ static size_t __iommu_unmap(struct iommu_domain *domain, while (unmapped < size) { size_t pgsize = iommu_pgsize(domain, iova, size - unmapped); - unmapped_page = ops->unmap(domain, iova, pgsize); 
+		unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather);
 		if (!unmapped_page)
 			break;
 
-		if (sync && ops->iotlb_range_add)
-			ops->iotlb_range_add(domain, iova, pgsize);
-
 		pr_debug("unmapped: iova 0x%lx size 0x%zx\n",
 			 iova, unmapped_page);
 
@@ -1913,9 +1910,6 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 		unmapped += unmapped_page;
 	}
 
-	if (sync && ops->iotlb_sync)
-		ops->iotlb_sync(domain);
-
 	trace_unmap(orig_iova, size, unmapped);
 	return unmapped;
 }
@@ -1923,14 +1917,22 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 size_t iommu_unmap(struct iommu_domain *domain,
 		   unsigned long iova, size_t size)
 {
-	return __iommu_unmap(domain, iova, size, true);
+	struct iommu_iotlb_gather iotlb_gather;
+	size_t ret;
+
+	iommu_iotlb_gather_init(&iotlb_gather);
+	ret = __iommu_unmap(domain, iova, size, &iotlb_gather);
+	iommu_tlb_sync(domain, &iotlb_gather);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_unmap);
 
 size_t iommu_unmap_fast(struct iommu_domain *domain,
-			unsigned long iova, size_t size)
+			unsigned long iova, size_t size,
+			struct iommu_iotlb_gather *iotlb_gather)
 {
-	return __iommu_unmap(domain, iova, size, false);
+	return __iommu_unmap(domain, iova, size, iotlb_gather);
 }
 EXPORT_SYMBOL_GPL(iommu_unmap_fast);
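For reference, a minimal caller-side sketch (not part of this patch) of the reworked core API: iommu_unmap() keeps its one-shot semantics by owning a gather internally, while iommu_unmap_fast() now expects the caller to supply the gather and to issue the final iommu_tlb_sync() itself, for example when several ranges are torn down back to back. The "domain", iova and size values are assumed to be valid; this is the same pattern the VFIO type1 changes later in this patch use.

/* Hypothetical caller, shown only to illustrate the new calling convention. */
static size_t example_unmap_batched(struct iommu_domain *domain,
				    unsigned long iova, size_t size)
{
	struct iommu_iotlb_gather gather;
	size_t unmapped = 0;

	iommu_iotlb_gather_init(&gather);

	/* Both calls only queue/gather invalidations via the driver. */
	unmapped += iommu_unmap_fast(domain, iova, size, &gather);
	unmapped += iommu_unmap_fast(domain, iova + size, size, &gather);

	/* A single sync completes all of the outstanding TLB maintenance. */
	iommu_tlb_sync(domain, &gather);

	return unmapped;
}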
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index ad0098c0c87c..76a8ec343d53 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -361,16 +361,16 @@ static void ipmmu_tlb_flush_all(void *cookie)
 	ipmmu_tlb_invalidate(domain);
 }
 
-static void ipmmu_tlb_add_flush(unsigned long iova, size_t size,
-				size_t granule, bool leaf, void *cookie)
+static void ipmmu_tlb_flush(unsigned long iova, size_t size,
+				size_t granule, void *cookie)
 {
-	/* The hardware doesn't support selective TLB flush. */
+	ipmmu_tlb_flush_all(cookie);
 }
 
-static const struct iommu_gather_ops ipmmu_gather_ops = {
+static const struct iommu_flush_ops ipmmu_flush_ops = {
 	.tlb_flush_all = ipmmu_tlb_flush_all,
-	.tlb_add_flush = ipmmu_tlb_add_flush,
-	.tlb_sync = ipmmu_tlb_flush_all,
+	.tlb_flush_walk = ipmmu_tlb_flush,
+	.tlb_flush_leaf = ipmmu_tlb_flush,
 };
 
 /* -----------------------------------------------------------------------------
@@ -480,7 +480,7 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
 	domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K;
 	domain->cfg.ias = 32;
 	domain->cfg.oas = 40;
-	domain->cfg.tlb = &ipmmu_gather_ops;
+	domain->cfg.tlb = &ipmmu_flush_ops;
 	domain->io_domain.geometry.aperture_end = DMA_BIT_MASK(32);
 	domain->io_domain.geometry.force_aperture = true;
 	/*
@@ -733,14 +733,14 @@ static int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova,
 }
 
 static size_t ipmmu_unmap(struct iommu_domain *io_domain, unsigned long iova,
-			  size_t size)
+			  size_t size, struct iommu_iotlb_gather *gather)
 {
 	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
 
-	return domain->iop->unmap(domain->iop, iova, size);
+	return domain->iop->unmap(domain->iop, iova, size, gather);
 }
 
-static void ipmmu_iotlb_sync(struct iommu_domain *io_domain)
+static void ipmmu_flush_iotlb_all(struct iommu_domain *io_domain)
 {
 	struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
 
@@ -748,6 +748,12 @@ static void ipmmu_iotlb_sync(struct iommu_domain *io_domain)
 		ipmmu_tlb_flush_all(domain);
 }
 
+static void ipmmu_iotlb_sync(struct iommu_domain *io_domain,
+			     struct iommu_iotlb_gather *gather)
+{
+	ipmmu_flush_iotlb_all(io_domain);
+}
+
 static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain,
 				      dma_addr_t iova)
 {
@@ -957,7 +963,7 @@ static const struct iommu_ops ipmmu_ops = {
 	.detach_dev = ipmmu_detach_device,
 	.map = ipmmu_map,
 	.unmap = ipmmu_unmap,
-	.flush_iotlb_all = ipmmu_iotlb_sync,
+	.flush_iotlb_all = ipmmu_flush_iotlb_all,
 	.iotlb_sync = ipmmu_iotlb_sync,
 	.iova_to_phys = ipmmu_iova_to_phys,
 	.add_device = ipmmu_add_device,
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index b25e2eb9e038..4c0be5b75c28 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -168,20 +168,29 @@ fail:
 	return;
 }
 
-static void __flush_iotlb_sync(void *cookie)
+static void __flush_iotlb_walk(unsigned long iova, size_t size,
+			       size_t granule, void *cookie)
 {
-	/*
-	 * Nothing is needed here, the barrier to guarantee
-	 * completion of the tlb sync operation is implicitly
-	 * taken care when the iommu client does a writel before
-	 * kick starting the other master.
- */ + __flush_iotlb_range(iova, size, granule, false, cookie); } -static const struct iommu_gather_ops msm_iommu_gather_ops = { +static void __flush_iotlb_leaf(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + __flush_iotlb_range(iova, size, granule, true, cookie); +} + +static void __flush_iotlb_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, void *cookie) +{ + __flush_iotlb_range(iova, granule, granule, true, cookie); +} + +static const struct iommu_flush_ops msm_iommu_flush_ops = { .tlb_flush_all = __flush_iotlb, - .tlb_add_flush = __flush_iotlb_range, - .tlb_sync = __flush_iotlb_sync, + .tlb_flush_walk = __flush_iotlb_walk, + .tlb_flush_leaf = __flush_iotlb_leaf, + .tlb_add_page = __flush_iotlb_page, }; static int msm_iommu_alloc_ctx(unsigned long *map, int start, int end) @@ -345,7 +354,7 @@ static int msm_iommu_domain_config(struct msm_priv *priv) .pgsize_bitmap = msm_iommu_ops.pgsize_bitmap, .ias = 32, .oas = 32, - .tlb = &msm_iommu_gather_ops, + .tlb = &msm_iommu_flush_ops, .iommu_dev = priv->dev, }; @@ -509,13 +518,13 @@ static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t len) + size_t len, struct iommu_iotlb_gather *gather) { struct msm_priv *priv = to_msm_priv(domain); unsigned long flags; spin_lock_irqsave(&priv->pgtlock, flags); - len = priv->iop->unmap(priv->iop, iova, len); + len = priv->iop->unmap(priv->iop, iova, len, gather); spin_unlock_irqrestore(&priv->pgtlock, flags); return len; @@ -691,6 +700,13 @@ static struct iommu_ops msm_iommu_ops = { .detach_dev = msm_iommu_detach_dev, .map = msm_iommu_map, .unmap = msm_iommu_unmap, + /* + * Nothing is needed here, the barrier to guarantee + * completion of the tlb sync operation is implicitly + * taken care when the iommu client does a writel before + * kick starting the other master. 
+ */ + .iotlb_sync = NULL, .iova_to_phys = msm_iommu_iova_to_phys, .add_device = msm_iommu_add_device, .remove_device = msm_iommu_remove_device, diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 82e4be4dfdaf..0827d51936fa 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -188,10 +188,32 @@ static void mtk_iommu_tlb_sync(void *cookie) } } -static const struct iommu_gather_ops mtk_iommu_gather_ops = { +static void mtk_iommu_tlb_flush_walk(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + mtk_iommu_tlb_add_flush_nosync(iova, size, granule, false, cookie); + mtk_iommu_tlb_sync(cookie); +} + +static void mtk_iommu_tlb_flush_leaf(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + mtk_iommu_tlb_add_flush_nosync(iova, size, granule, true, cookie); + mtk_iommu_tlb_sync(cookie); +} + +static void mtk_iommu_tlb_flush_page_nosync(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, + void *cookie) +{ + mtk_iommu_tlb_add_flush_nosync(iova, granule, granule, true, cookie); +} + +static const struct iommu_flush_ops mtk_iommu_flush_ops = { .tlb_flush_all = mtk_iommu_tlb_flush_all, - .tlb_add_flush = mtk_iommu_tlb_add_flush_nosync, - .tlb_sync = mtk_iommu_tlb_sync, + .tlb_flush_walk = mtk_iommu_tlb_flush_walk, + .tlb_flush_leaf = mtk_iommu_tlb_flush_leaf, + .tlb_add_page = mtk_iommu_tlb_flush_page_nosync, }; static irqreturn_t mtk_iommu_isr(int irq, void *dev_id) @@ -267,7 +289,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom) .pgsize_bitmap = mtk_iommu_ops.pgsize_bitmap, .ias = 32, .oas = 32, - .tlb = &mtk_iommu_gather_ops, + .tlb = &mtk_iommu_flush_ops, .iommu_dev = data->dev, }; @@ -371,20 +393,27 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t mtk_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size) + unsigned long iova, size_t size, + struct iommu_iotlb_gather *gather) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); unsigned long flags; size_t unmapsz; spin_lock_irqsave(&dom->pgtlock, flags); - unmapsz = dom->iop->unmap(dom->iop, iova, size); + unmapsz = dom->iop->unmap(dom->iop, iova, size, gather); spin_unlock_irqrestore(&dom->pgtlock, flags); return unmapsz; } -static void mtk_iommu_iotlb_sync(struct iommu_domain *domain) +static void mtk_iommu_flush_iotlb_all(struct iommu_domain *domain) +{ + mtk_iommu_tlb_sync(mtk_iommu_get_m4u_data()); +} + +static void mtk_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { mtk_iommu_tlb_sync(mtk_iommu_get_m4u_data()); } @@ -490,7 +519,7 @@ static const struct iommu_ops mtk_iommu_ops = { .detach_dev = mtk_iommu_detach_device, .map = mtk_iommu_map, .unmap = mtk_iommu_unmap, - .flush_iotlb_all = mtk_iommu_iotlb_sync, + .flush_iotlb_all = mtk_iommu_flush_iotlb_all, .iotlb_sync = mtk_iommu_iotlb_sync, .iova_to_phys = mtk_iommu_iova_to_phys, .add_device = mtk_iommu_add_device, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index abeeac488372..7b92ddd5d9fd 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -324,7 +324,8 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t mtk_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size) + unsigned long iova, size_t size, + struct iommu_iotlb_gather *gather) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); unsigned long flags; diff --git a/drivers/iommu/omap-iommu.c 
b/drivers/iommu/omap-iommu.c index dfb961d8c21b..8039bc5ee425 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1149,7 +1149,7 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, } static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct omap_iommu_domain *omap_domain = to_omap_domain(domain); struct device *dev = omap_domain->dev; diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index a2062d13584f..fd33cf5981d7 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -165,10 +165,32 @@ static void qcom_iommu_tlb_inv_range_nosync(unsigned long iova, size_t size, } } -static const struct iommu_gather_ops qcom_gather_ops = { +static void qcom_iommu_tlb_flush_walk(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + qcom_iommu_tlb_inv_range_nosync(iova, size, granule, false, cookie); + qcom_iommu_tlb_sync(cookie); +} + +static void qcom_iommu_tlb_flush_leaf(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + qcom_iommu_tlb_inv_range_nosync(iova, size, granule, true, cookie); + qcom_iommu_tlb_sync(cookie); +} + +static void qcom_iommu_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, + void *cookie) +{ + qcom_iommu_tlb_inv_range_nosync(iova, granule, granule, true, cookie); +} + +static const struct iommu_flush_ops qcom_flush_ops = { .tlb_flush_all = qcom_iommu_tlb_inv_context, - .tlb_add_flush = qcom_iommu_tlb_inv_range_nosync, - .tlb_sync = qcom_iommu_tlb_sync, + .tlb_flush_walk = qcom_iommu_tlb_flush_walk, + .tlb_flush_leaf = qcom_iommu_tlb_flush_leaf, + .tlb_add_page = qcom_iommu_tlb_add_page, }; static irqreturn_t qcom_iommu_fault(int irq, void *dev) @@ -216,7 +238,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain, .pgsize_bitmap = qcom_iommu_ops.pgsize_bitmap, .ias = 32, .oas = 40, - .tlb = &qcom_gather_ops, + .tlb = &qcom_flush_ops, .iommu_dev = qcom_iommu->dev, }; @@ -418,7 +440,7 @@ static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { size_t ret; unsigned long flags; @@ -435,14 +457,14 @@ static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova, */ pm_runtime_get_sync(qcom_domain->iommu->dev); spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags); - ret = ops->unmap(ops, iova, size); + ret = ops->unmap(ops, iova, size, gather); spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags); pm_runtime_put_sync(qcom_domain->iommu->dev); return ret; } -static void qcom_iommu_iotlb_sync(struct iommu_domain *domain) +static void qcom_iommu_flush_iotlb_all(struct iommu_domain *domain) { struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); struct io_pgtable *pgtable = container_of(qcom_domain->pgtbl_ops, @@ -455,6 +477,12 @@ static void qcom_iommu_iotlb_sync(struct iommu_domain *domain) pm_runtime_put_sync(qcom_domain->iommu->dev); } +static void qcom_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + qcom_iommu_flush_iotlb_all(domain); +} + static phys_addr_t qcom_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { @@ -582,7 +610,7 @@ static const struct iommu_ops qcom_iommu_ops = { .detach_dev = qcom_iommu_detach_dev, .map = qcom_iommu_map, .unmap = qcom_iommu_unmap, - 
.flush_iotlb_all = qcom_iommu_iotlb_sync, + .flush_iotlb_all = qcom_iommu_flush_iotlb_all, .iotlb_sync = qcom_iommu_iotlb_sync, .iova_to_phys = qcom_iommu_iova_to_phys, .add_device = qcom_iommu_add_device, diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index dc26d74d79c2..26290f310f90 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -794,7 +794,7 @@ static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova, } static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct rk_iommu_domain *rk_domain = to_rk_domain(domain); unsigned long flags; diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 22d4db302c1c..3b0b18e23187 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -314,7 +314,8 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain, } static size_t s390_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size) + unsigned long iova, size_t size, + struct iommu_iotlb_gather *gather) { struct s390_domain *s390_domain = to_s390_domain(domain); int flags = ZPCI_PTE_INVALID; diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index 6d40bc1b38bf..3924f7c05544 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -207,7 +207,7 @@ static inline int __gart_iommu_unmap(struct gart_device *gart, } static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t bytes) + size_t bytes, struct iommu_iotlb_gather *gather) { struct gart_device *gart = gart_handle; int err; @@ -273,11 +273,17 @@ static int gart_iommu_of_xlate(struct device *dev, return 0; } -static void gart_iommu_sync(struct iommu_domain *domain) +static void gart_iommu_sync_map(struct iommu_domain *domain) { FLUSH_GART_REGS(gart_handle); } +static void gart_iommu_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + gart_iommu_sync_map(domain); +} + static const struct iommu_ops gart_iommu_ops = { .capable = gart_iommu_capable, .domain_alloc = gart_iommu_domain_alloc, @@ -292,7 +298,7 @@ static const struct iommu_ops gart_iommu_ops = { .iova_to_phys = gart_iommu_iova_to_phys, .pgsize_bitmap = GART_IOMMU_PGSIZES, .of_xlate = gart_iommu_of_xlate, - .iotlb_sync_map = gart_iommu_sync, + .iotlb_sync_map = gart_iommu_sync_map, .iotlb_sync = gart_iommu_sync, }; diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index c4a652b227f8..7293fc3f796d 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -680,7 +680,7 @@ static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova, } static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { struct tegra_smmu_as *as = to_smmu_as(domain); dma_addr_t pte_dma; diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 433f4d2ee956..5f9f91a4d7f3 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -742,7 +742,7 @@ static int viommu_map(struct iommu_domain *domain, unsigned long iova, } static size_t viommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size) + size_t size, struct iommu_iotlb_gather *gather) { int ret = 0; size_t unmapped; @@ -788,7 +788,8 @@ static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain, return paddr; } -static void 
viommu_iotlb_sync(struct iommu_domain *domain) +static void viommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { struct viommu_domain *vdomain = to_viommu_domain(domain); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 054391f30fa8..ad830abe1021 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -650,12 +650,13 @@ unpin_exit: } static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, - struct list_head *regions) + struct list_head *regions, + struct iommu_iotlb_gather *iotlb_gather) { long unlocked = 0; struct vfio_regions *entry, *next; - iommu_tlb_sync(domain->domain); + iommu_tlb_sync(domain->domain, iotlb_gather); list_for_each_entry_safe(entry, next, regions, list) { unlocked += vfio_unpin_pages_remote(dma, @@ -685,18 +686,19 @@ static size_t unmap_unpin_fast(struct vfio_domain *domain, struct vfio_dma *dma, dma_addr_t *iova, size_t len, phys_addr_t phys, long *unlocked, struct list_head *unmapped_list, - int *unmapped_cnt) + int *unmapped_cnt, + struct iommu_iotlb_gather *iotlb_gather) { size_t unmapped = 0; struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry) { - unmapped = iommu_unmap_fast(domain->domain, *iova, len); + unmapped = iommu_unmap_fast(domain->domain, *iova, len, + iotlb_gather); if (!unmapped) { kfree(entry); } else { - iommu_tlb_range_add(domain->domain, *iova, unmapped); entry->iova = *iova; entry->phys = phys; entry->len = unmapped; @@ -712,8 +714,8 @@ static size_t unmap_unpin_fast(struct vfio_domain *domain, * or in case of errors. */ if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) { - *unlocked += vfio_sync_unpin(dma, domain, - unmapped_list); + *unlocked += vfio_sync_unpin(dma, domain, unmapped_list, + iotlb_gather); *unmapped_cnt = 0; } @@ -744,6 +746,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, dma_addr_t iova = dma->iova, end = dma->iova + dma->size; struct vfio_domain *domain, *d; LIST_HEAD(unmapped_region_list); + struct iommu_iotlb_gather iotlb_gather; int unmapped_region_cnt = 0; long unlocked = 0; @@ -768,6 +771,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, cond_resched(); } + iommu_iotlb_gather_init(&iotlb_gather); while (iova < end) { size_t unmapped, len; phys_addr_t phys, next; @@ -796,7 +800,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, */ unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys, &unlocked, &unmapped_region_list, - &unmapped_region_cnt); + &unmapped_region_cnt, + &iotlb_gather); if (!unmapped) { unmapped = unmap_unpin_slow(domain, dma, &iova, len, phys, &unlocked); @@ -807,8 +812,10 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, dma->iommu_mapped = false; - if (unmapped_region_cnt) - unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list); + if (unmapped_region_cnt) { + unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list, + &iotlb_gather); + } if (do_accounting) { vfio_lock_acct(dma, -unlocked, true); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index b5a450a3bb47..6b1b8be3ebec 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -1,7 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IO_PGTABLE_H #define __IO_PGTABLE_H + #include +#include /* * Public API for use by IOMMU drivers @@ -17,22 +19,31 @@ enum io_pgtable_fmt { }; /** - * struct iommu_gather_ops - IOMMU 
callbacks for TLB and page table management.
+ * struct iommu_flush_ops - IOMMU callbacks for TLB and page table management.
  *
- * @tlb_flush_all: Synchronously invalidate the entire TLB context.
- * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range.
- * @tlb_sync:      Ensure any queued TLB invalidation has taken effect, and
- *                 any corresponding page table updates are visible to the
- *                 IOMMU.
+ * @tlb_flush_all:  Synchronously invalidate the entire TLB context.
+ * @tlb_flush_walk: Synchronously invalidate all intermediate TLB state
+ *                  (sometimes referred to as the "walk cache") for a virtual
+ *                  address range.
+ * @tlb_flush_leaf: Synchronously invalidate all leaf TLB state for a virtual
+ *                  address range.
+ * @tlb_add_page:   Optional callback to queue up leaf TLB invalidation for a
+ *                  single page. IOMMUs that cannot batch TLB invalidation
+ *                  operations efficiently will typically issue them here, but
+ *                  others may decide to update the iommu_iotlb_gather structure
+ *                  and defer the invalidation until iommu_tlb_sync() instead.
  *
  * Note that these can all be called in atomic context and must therefore
  * not block.
  */
-struct iommu_gather_ops {
+struct iommu_flush_ops {
 	void (*tlb_flush_all)(void *cookie);
-	void (*tlb_add_flush)(unsigned long iova, size_t size, size_t granule,
-			      bool leaf, void *cookie);
-	void (*tlb_sync)(void *cookie);
+	void (*tlb_flush_walk)(unsigned long iova, size_t size, size_t granule,
+			       void *cookie);
+	void (*tlb_flush_leaf)(unsigned long iova, size_t size, size_t granule,
+			       void *cookie);
+	void (*tlb_add_page)(struct iommu_iotlb_gather *gather,
+			     unsigned long iova, size_t granule, void *cookie);
 };
 
 /**
@@ -84,7 +95,7 @@ struct io_pgtable_cfg {
 	unsigned int ias;
 	unsigned int oas;
 	bool coherent_walk;
-	const struct iommu_gather_ops *tlb;
+	const struct iommu_flush_ops *tlb;
 	struct device *iommu_dev;
 
 	/* Low-level data specific to the table format */
@@ -128,7 +139,7 @@ struct io_pgtable_ops {
 	int (*map)(struct io_pgtable_ops *ops, unsigned long iova,
 		   phys_addr_t paddr, size_t size, int prot);
 	size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
-			size_t size);
+			size_t size, struct iommu_iotlb_gather *gather);
 	phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
 				    unsigned long iova);
 };
@@ -184,15 +195,27 @@ static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
 	iop->cfg.tlb->tlb_flush_all(iop->cookie);
 }
 
-static inline void io_pgtable_tlb_add_flush(struct io_pgtable *iop,
-		unsigned long iova, size_t size, size_t granule, bool leaf)
+static inline void
+io_pgtable_tlb_flush_walk(struct io_pgtable *iop, unsigned long iova,
+			  size_t size, size_t granule)
 {
-	iop->cfg.tlb->tlb_add_flush(iova, size, granule, leaf, iop->cookie);
+	iop->cfg.tlb->tlb_flush_walk(iova, size, granule, iop->cookie);
 }
 
-static inline void io_pgtable_tlb_sync(struct io_pgtable *iop)
+static inline void
+io_pgtable_tlb_flush_leaf(struct io_pgtable *iop, unsigned long iova,
+			  size_t size, size_t granule)
 {
-	iop->cfg.tlb->tlb_sync(iop->cookie);
+	iop->cfg.tlb->tlb_flush_leaf(iova, size, granule, iop->cookie);
+}
+
+static inline void
+io_pgtable_tlb_add_page(struct io_pgtable *iop,
+			struct iommu_iotlb_gather *gather, unsigned long iova,
+			size_t granule)
+{
+	if (iop->cfg.tlb->tlb_add_page)
+		iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie);
 }
 
 /**
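To make the new contract concrete, here is an illustrative driver-side sketch (not part of this patch) of an iommu_flush_ops instance for a hypothetical driver that already has a range-invalidate plus wait-for-completion pair; foo_tlb_flush_all(), foo_tlb_inv_range() and foo_tlb_sync() are assumed driver internals.

/* Hypothetical driver primitives, assumed to exist elsewhere in the driver. */
extern void foo_tlb_flush_all(void *cookie);
extern void foo_tlb_inv_range(unsigned long iova, size_t size,
			      size_t granule, bool leaf, void *cookie);
extern void foo_tlb_sync(void *cookie);

static void foo_tlb_flush_walk(unsigned long iova, size_t size,
			       size_t granule, void *cookie)
{
	/* Walk-cache invalidation must be complete before this returns. */
	foo_tlb_inv_range(iova, size, granule, false, cookie);
	foo_tlb_sync(cookie);
}

static void foo_tlb_flush_leaf(unsigned long iova, size_t size,
			       size_t granule, void *cookie)
{
	foo_tlb_inv_range(iova, size, granule, true, cookie);
	foo_tlb_sync(cookie);
}

static void foo_tlb_add_page(struct iommu_iotlb_gather *gather,
			     unsigned long iova, size_t granule, void *cookie)
{
	/* Only queue the invalidation; ->iotlb_sync() completes it later. */
	foo_tlb_inv_range(iova, granule, granule, true, cookie);
}

static const struct iommu_flush_ops foo_flush_ops = {
	.tlb_flush_all	= foo_tlb_flush_all,
	.tlb_flush_walk	= foo_tlb_flush_walk,
	.tlb_flush_leaf	= foo_tlb_flush_leaf,
	.tlb_add_page	= foo_tlb_add_page,
};

This is essentially the shape of the msm, mtk and qcom conversions earlier in the patch.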
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index fdc355ccc570..64ebaff33455 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -191,6 +191,23 @@ struct iommu_sva_ops {
 
 #ifdef CONFIG_IOMMU_API
 
+/**
+ * struct iommu_iotlb_gather - Range information for a pending IOTLB flush
+ *
+ * @start: IOVA representing the start of the range to be flushed
+ * @end: IOVA representing the end of the range to be flushed (exclusive)
+ * @pgsize: The interval at which to perform the flush
+ *
+ * This structure is intended to be updated by multiple calls to the
+ * ->unmap() function in struct iommu_ops before eventually being passed
+ * into ->iotlb_sync().
+ */
+struct iommu_iotlb_gather {
+	unsigned long		start;
+	unsigned long		end;
+	size_t			pgsize;
+};
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
@@ -201,7 +218,6 @@ struct iommu_sva_ops {
  * @map: map a physically contiguous memory region to an iommu domain
  * @unmap: unmap a physically contiguous memory region from an iommu domain
  * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain
- * @iotlb_range_add: Add a given iova range to the flush queue for this domain
  * @iotlb_sync_map: Sync mappings created recently using @map to the hardware
  * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
  *            queue
@@ -242,12 +258,11 @@ struct iommu_ops {
 	int (*map)(struct iommu_domain *domain, unsigned long iova,
 		   phys_addr_t paddr, size_t size, int prot);
 	size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
-		     size_t size);
+		     size_t size, struct iommu_iotlb_gather *iotlb_gather);
 	void (*flush_iotlb_all)(struct iommu_domain *domain);
-	void (*iotlb_range_add)(struct iommu_domain *domain,
-				unsigned long iova, size_t size);
 	void (*iotlb_sync_map)(struct iommu_domain *domain);
-	void (*iotlb_sync)(struct iommu_domain *domain);
+	void (*iotlb_sync)(struct iommu_domain *domain,
+			   struct iommu_iotlb_gather *iotlb_gather);
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova);
 	int (*add_device)(struct device *dev);
 	void (*remove_device)(struct device *dev);
@@ -378,6 +393,13 @@ static inline struct iommu_device *dev_to_iommu_device(struct device *dev)
 	return (struct iommu_device *)dev_get_drvdata(dev);
 }
 
+static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
+{
+	*gather = (struct iommu_iotlb_gather) {
+		.start	= ULONG_MAX,
+	};
+}
+
 #define IOMMU_GROUP_NOTIFY_ADD_DEVICE		1 /* Device added */
 #define IOMMU_GROUP_NOTIFY_DEL_DEVICE		2 /* Pre Device removed */
 #define IOMMU_GROUP_NOTIFY_BIND_DRIVER		3 /* Pre Driver bind */
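As a hedged illustration (not introduced by this patch), a driver whose hardware can invalidate by range might consume the gathered bounds in its ->iotlb_sync() callback instead of flushing the whole context; to_foo_domain(), foo_inv_range() and foo_sync() are hypothetical driver helpers, and @end is treated as exclusive per the structure documentation above.

static void foo_iotlb_sync(struct iommu_domain *domain,
			   struct iommu_iotlb_gather *gather)
{
	struct foo_domain *fd = to_foo_domain(domain);

	/* After iommu_iotlb_gather_init(), start > end means nothing gathered. */
	if (gather->start > gather->end)
		return;

	foo_inv_range(gather->start, gather->end - gather->start,
		      gather->pgsize, fd);
	foo_sync(fd);
}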
@@ -402,7 +424,8 @@ extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
 extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova,
 			  size_t size);
 extern size_t iommu_unmap_fast(struct iommu_domain *domain,
-			       unsigned long iova, size_t size);
+			       unsigned long iova, size_t size,
+			       struct iommu_iotlb_gather *iotlb_gather);
 extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
 			   struct scatterlist *sg,unsigned int nents, int prot);
 extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova);
@@ -476,17 +499,38 @@ static inline void iommu_flush_tlb_all(struct iommu_domain *domain)
 		domain->ops->flush_iotlb_all(domain);
 }
 
-static inline void iommu_tlb_range_add(struct iommu_domain *domain,
-				       unsigned long iova, size_t size)
-{
-	if (domain->ops->iotlb_range_add)
-		domain->ops->iotlb_range_add(domain, iova, size);
-}
-
-static inline void iommu_tlb_sync(struct iommu_domain *domain)
+static inline void iommu_tlb_sync(struct iommu_domain *domain,
+				  struct iommu_iotlb_gather *iotlb_gather)
 {
 	if (domain->ops->iotlb_sync)
-		domain->ops->iotlb_sync(domain);
+		domain->ops->iotlb_sync(domain, iotlb_gather);
+
+	iommu_iotlb_gather_init(iotlb_gather);
+}
+
+static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
+					       struct iommu_iotlb_gather *gather,
+					       unsigned long iova, size_t size)
+{
+	unsigned long start = iova, end = start + size;
+
+	/*
+	 * If the new page is disjoint from the current range or is mapped at
+	 * a different granularity, then sync the TLB so that the gather
+	 * structure can be rewritten.
+	 */
+	if (gather->pgsize != size ||
+	    end < gather->start || start > gather->end) {
+		if (gather->pgsize)
+			iommu_tlb_sync(domain, gather);
+		gather->pgsize = size;
+	}
+
+	if (gather->end < end)
+		gather->end = end;
+
+	if (gather->start > start)
+		gather->start = start;
 }
 
 /* PCI device grouping function */
@@ -567,6 +611,7 @@ struct iommu_group {};
 struct iommu_fwspec {};
 struct iommu_device {};
 struct iommu_fault_param {};
+struct iommu_iotlb_gather {};
 
 static inline bool iommu_present(struct bus_type *bus)
 {
@@ -621,7 +666,8 @@ static inline size_t iommu_unmap(struct iommu_domain *domain,
 }
 
 static inline size_t iommu_unmap_fast(struct iommu_domain *domain,
-				      unsigned long iova, int gfp_order)
+				      unsigned long iova, int gfp_order,
+				      struct iommu_iotlb_gather *iotlb_gather)
 {
 	return 0;
 }
@@ -637,12 +683,8 @@ static inline void iommu_flush_tlb_all(struct iommu_domain *domain)
 {
 }
 
-static inline void iommu_tlb_range_add(struct iommu_domain *domain,
-				       unsigned long iova, size_t size)
-{
-}
-
-static inline void iommu_tlb_sync(struct iommu_domain *domain)
+static inline void iommu_tlb_sync(struct iommu_domain *domain,
+				  struct iommu_iotlb_gather *iotlb_gather)
 {
 }
 
@@ -827,6 +869,16 @@ static inline struct iommu_device *dev_to_iommu_device(struct device *dev)
 	return NULL;
 }
 
+static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather)
+{
+}
+
+static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
+					       struct iommu_iotlb_gather *gather,
+					       unsigned long iova, size_t size)
+{
+}
+
 static inline void iommu_device_unregister(struct iommu_device *iommu)
 {
 }
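Finally, a sketch (again hypothetical, not part of the patch) of the deferred path mentioned in the tlb_add_page documentation: a driver can simply record pages in the gather from its io-pgtable callback and leave the actual invalidation to ->iotlb_sync(). struct foo_domain, to_foo_domain() and the pgtbl_ops member are assumed driver details, and the cookie is assumed to be the driver's domain wrapper.

static void foo_tlb_add_page(struct iommu_iotlb_gather *gather,
			     unsigned long iova, size_t granule, void *cookie)
{
	struct foo_domain *fd = cookie;	/* cookie set up at pgtable alloc time */

	/* Just accumulate the range; no hardware access happens here. */
	iommu_iotlb_gather_add_page(&fd->domain, gather, iova, granule);
}

static size_t foo_unmap(struct iommu_domain *domain, unsigned long iova,
			size_t size, struct iommu_iotlb_gather *gather)
{
	struct foo_domain *fd = to_foo_domain(domain);

	/* The core-supplied gather is threaded straight into io-pgtable. */
	return fd->pgtbl_ops->unmap(fd->pgtbl_ops, iova, size, gather);
}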