From 66e8d3b8995c05cb1e17b120f52e0218bc534a49 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 17:02:14 +0100 Subject: [PATCH 01/11] sdhci: stop poking into swiotlb internals Use the proper API to query the max mapping size instead of guessing it based on swiotlb internals. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/mmc/host/sdhci.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 646823ddd317..2d73407ee52e 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -4582,12 +4581,8 @@ int sdhci_setup_host(struct sdhci_host *host) mmc->max_segs = SDHCI_MAX_SEGS; } else if (host->flags & SDHCI_USE_SDMA) { mmc->max_segs = 1; - if (swiotlb_max_segment()) { - unsigned int max_req_size = (1 << IO_TLB_SHIFT) * - IO_TLB_SEGSIZE; - mmc->max_req_size = min(mmc->max_req_size, - max_req_size); - } + mmc->max_req_size = min_t(size_t, mmc->max_req_size, + dma_max_mapping_size(mmc_dev(mmc))); } else { /* PIO */ mmc->max_segs = SDHCI_MAX_SEGS; } From 36950f2da1ea4cb683be174f6f581e25b2d33e71 Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Mon, 1 Feb 2021 10:30:15 -0800 Subject: [PATCH 02/11] driver core: add a min_align_mask field to struct device_dma_parameters Some devices rely on the address offset in a page to function correctly (NVMe driver as an example). These devices may use a different page size than the Linux kernel. The address offset has to be preserved upon mapping, and in order to do so, we need to record the page_offset_mask first. Signed-off-by: Jianxiong Gao Signed-off-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/device.h | 1 + include/linux/dma-mapping.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/device.h b/include/linux/device.h index 1779f90eeb4c..7960bf516dd7 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -291,6 +291,7 @@ struct device_dma_parameters { * sg limitations. */ unsigned int max_segment_size; + unsigned int min_align_mask; unsigned long segment_boundary_mask; }; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2e49996a8f39..9c26225754e7 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -500,6 +500,22 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) return -EIO; } +static inline unsigned int dma_get_min_align_mask(struct device *dev) +{ + if (dev->dma_parms) + return dev->dma_parms->min_align_mask; + return 0; +} + +static inline int dma_set_min_align_mask(struct device *dev, + unsigned int min_align_mask) +{ + if (WARN_ON_ONCE(!dev->dma_parms)) + return -EIO; + dev->dma_parms->min_align_mask = min_align_mask; + return 0; +} + static inline int dma_get_cache_alignment(void) { #ifdef ARCH_DMA_MINALIGN From b5d7ccb7aac3895c2138fe0980a109116ce15eff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Feb 2021 11:18:40 +0100 Subject: [PATCH 03/11] swiotlb: add a IO_TLB_SIZE define Add a new IO_TLB_SIZE define instead open coding it using IO_TLB_SHIFT all over. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 1 + kernel/dma/swiotlb.c | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index d9c9fc9ca5d2..5857a937c637 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,6 +29,7 @@ enum swiotlb_force { * controllable. */ #define IO_TLB_SHIFT 11 +#define IO_TLB_SIZE (1 << IO_TLB_SHIFT) /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 7c42df6e6100..768187d0e17e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -171,7 +171,7 @@ void __init swiotlb_adjust_size(unsigned long new_size) * adjust/expand SWIOTLB size for their use. */ if (!io_tlb_nslabs) { - size = ALIGN(new_size, 1 << IO_TLB_SHIFT); + size = ALIGN(new_size, IO_TLB_SIZE); io_tlb_nslabs = size >> IO_TLB_SHIFT; io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); @@ -491,20 +491,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, tbl_dma_addr &= mask; - offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT; /* * Carefully handle integer overflow which can occur when mask == ~0UL. */ max_slots = mask + 1 - ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT + ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else @@ -598,7 +598,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; @@ -649,7 +649,7 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr == INVALID_PHYS_ADDR) return; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1); switch (target) { case SYNC_FOR_CPU: @@ -707,7 +707,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, size_t swiotlb_max_mapping_size(struct device *dev) { - return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; + return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; } bool is_swiotlb_active(void) From c7fbeca757fe74135d8b6a4c8ddaef76f5775d68 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 10:11:20 +0100 Subject: [PATCH 04/11] swiotlb: factor out an io_tlb_offset helper Replace the very genericly named OFFSET macro with a little inline helper that hardcodes the alignment to the only value ever passed. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 768187d0e17e..7705821dcdbd 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -50,9 +50,6 @@ #define CREATE_TRACE_POINTS #include -#define OFFSET(val,align) ((unsigned long) \ - ( (val) & ( (align) - 1))) - #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* @@ -192,6 +189,11 @@ void swiotlb_print_info(void) bytes >> 20); } +static inline unsigned long io_tlb_offset(unsigned long val) +{ + return val & (IO_TLB_SEGSIZE - 1); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -241,7 +243,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) __func__, alloc_size, PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; @@ -375,7 +377,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) goto cleanup4; for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; @@ -546,7 +548,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, for (i = index; i < (int) (index + nslots); i++) io_tlb_list[i] = 0; - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) io_tlb_list[i] = ++count; tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); @@ -632,7 +636,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * Step 2: merge the returned slots with the preceding slots, * if available (non zero) */ - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) io_tlb_list[i] = ++count; io_tlb_used -= nslots; From c32a77fd18780a5192dfb6eec69f239faebf28fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Feb 2021 11:19:34 +0100 Subject: [PATCH 05/11] swiotlb: factor out a nr_slots helper Factor out a helper to find the number of slots for a given size. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 7705821dcdbd..9492219b0743 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -194,6 +194,11 @@ static inline unsigned long io_tlb_offset(unsigned long val) return val & (IO_TLB_SEGSIZE - 1); } +static inline unsigned long nr_slots(u64 val) +{ + return DIV_ROUND_UP(val, IO_TLB_SIZE); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -493,20 +498,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, tbl_dma_addr &= mask; - offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT; + offset_slots = nr_slots(tbl_dma_addr); /* * Carefully handle integer overflow which can occur when mask == ~0UL. */ max_slots = mask + 1 - ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT + ? nr_slots(mask + 1) : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; + nslots = nr_slots(alloc_size); if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else @@ -602,7 +607,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; + int i, count, nslots = nr_slots(alloc_size); int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; From ca10d0f8e530600ec63c603dbace2c30927d70b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 10:13:40 +0100 Subject: [PATCH 06/11] swiotlb: clean up swiotlb_tbl_unmap_single Remove a layer of pointless indentation, replace a hard to follow ternary expression with a plain if/else. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9492219b0743..b38b1553c466 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -626,28 +626,29 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * with slots below and above the pool being returned. */ spin_lock_irqsave(&io_tlb_lock, flags); - { - count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? - io_tlb_list[index + nslots] : 0); - /* - * Step 1: return the slots to the free list, merging the - * slots with superceeding slots - */ - for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - /* - * Step 2: merge the returned slots with the preceding slots, - * if available (non zero) - */ - for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; + if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) + count = io_tlb_list[index + nslots]; + else + count = 0; - io_tlb_used -= nslots; + /* + * Step 1: return the slots to the free list, merging the slots with + * superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) { + io_tlb_list[i] = ++count; + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } + + /* + * Step 2: merge the returned slots with the preceding slots, if + * available (non zero) + */ + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + io_tlb_used -= nslots; spin_unlock_irqrestore(&io_tlb_lock, flags); } From 26a7e094783d482f3e125f09945a5bb1d867b2e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Feb 2021 11:08:35 +0100 Subject: [PATCH 07/11] swiotlb: refactor swiotlb_tbl_map_single Split out a bunch of a self-contained helpers to make the function easier to follow. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 205 +++++++++++++++++++++---------------------- 1 file changed, 102 insertions(+), 103 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b38b1553c466..6962cb4efb08 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -468,19 +468,99 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, +#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) + +/* + * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. + */ +static inline unsigned long get_max_slots(unsigned long boundary_mask) +{ + if (boundary_mask == ~0UL) + return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + return nr_slots(boundary_mask + 1); +} + +static unsigned int wrap_index(unsigned int index) +{ + if (index >= io_tlb_nslabs) + return 0; + return index; +} + +/* + * Find a suitable number of IO TLB entries size that will fit this request and + * allocate a buffer from that IO TLB pool. + */ +static int find_slots(struct device *dev, size_t alloc_size) +{ + unsigned long boundary_mask = dma_get_seg_boundary(dev); + dma_addr_t tbl_dma_addr = + phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; + unsigned long max_slots = get_max_slots(boundary_mask); + unsigned int nslots = nr_slots(alloc_size), stride = 1; + unsigned int index, wrap, count = 0, i; + unsigned long flags; + + BUG_ON(!nslots); + + /* + * For mappings greater than or equal to a page, we limit the stride + * (and hence alignment) to a page size. + */ + if (alloc_size >= PAGE_SIZE) + stride <<= (PAGE_SHIFT - IO_TLB_SHIFT); + + spin_lock_irqsave(&io_tlb_lock, flags); + if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) + goto not_found; + + index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); + do { + /* + * If we find a slot that indicates we have 'nslots' number of + * contiguous buffers, we allocate the buffers from that slot + * and mark the entries as '0' indicating unavailable. + */ + if (!iommu_is_span_boundary(index, nslots, + nr_slots(tbl_dma_addr), + max_slots)) { + if (io_tlb_list[index] >= nslots) + goto found; + } + index = wrap_index(index + stride); + } while (index != wrap); + +not_found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + return -1; + +found: + for (i = index; i < index + nslots; i++) + io_tlb_list[i] = 0; + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + + /* + * Update the indices to avoid searching in the next round. + */ + if (index + nslots < io_tlb_nslabs) + io_tlb_index = index + nslots; + else + io_tlb_index = 0; + io_tlb_used += nslots; + + spin_unlock_irqrestore(&io_tlb_lock, flags); + return index; +} + +phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); - unsigned long flags; + unsigned int index, i; phys_addr_t tlb_addr; - unsigned int nslots, stride, index, wrap; - int i; - unsigned long mask; - unsigned long offset_slots; - unsigned long max_slots; - unsigned long tmp_io_tlb_used; if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); @@ -489,113 +569,32 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); if (mapping_size > alloc_size) { - dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", + dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", mapping_size, alloc_size); return (phys_addr_t)DMA_MAPPING_ERROR; } - mask = dma_get_seg_boundary(hwdev); - - tbl_dma_addr &= mask; - - offset_slots = nr_slots(tbl_dma_addr); - - /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ - max_slots = mask + 1 - ? nr_slots(mask + 1) - : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); - - /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. - */ - nslots = nr_slots(alloc_size); - if (alloc_size >= PAGE_SIZE) - stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); - else - stride = 1; - - BUG_ON(!nslots); - - /* - * Find suitable number of IO TLB entries size that will fit this - * request and allocate a buffer from that IO TLB pool. - */ - spin_lock_irqsave(&io_tlb_lock, flags); - - if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) - goto not_found; - - index = ALIGN(io_tlb_index, stride); - if (index >= io_tlb_nslabs) - index = 0; - wrap = index; - - do { - while (iommu_is_span_boundary(index, nslots, offset_slots, - max_slots)) { - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - if (index == wrap) - goto not_found; - } - - /* - * If we find a slot that indicates we have 'nslots' number of - * contiguous buffers, we allocate the buffers from that slot - * and mark the entries as '0' indicating unavailable. - */ - if (io_tlb_list[index] >= nslots) { - int count = 0; - - for (i = index; i < (int) (index + nslots); i++) - io_tlb_list[i] = 0; - for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in the next - * round. - */ - io_tlb_index = ((index + nslots) < io_tlb_nslabs - ? (index + nslots) : 0); - - goto found; - } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - } while (index != wrap); - -not_found: - tmp_io_tlb_used = io_tlb_used; - - spin_unlock_irqrestore(&io_tlb_lock, flags); - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, io_tlb_nslabs, tmp_io_tlb_used); - return (phys_addr_t)DMA_MAPPING_ERROR; -found: - io_tlb_used += nslots; - spin_unlock_irqrestore(&io_tlb_lock, flags); + index = find_slots(dev, alloc_size); + if (index == -1) { + if (!(attrs & DMA_ATTR_NO_WARN)) + dev_warn_ratelimited(dev, + "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + alloc_size, io_tlb_nslabs, io_tlb_used); + return (phys_addr_t)DMA_MAPPING_ERROR; + } /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + for (i = 0; i < nr_slots(alloc_size); i++) + io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); + + tlb_addr = slot_addr(io_tlb_start, index); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); - return tlb_addr; } From 16fc3cef33a04632ab6b31758abdd77563a20759 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Feb 2021 10:44:16 +0100 Subject: [PATCH 08/11] swiotlb: don't modify orig_addr in swiotlb_tbl_sync_single swiotlb_tbl_map_single currently nevers sets a tlb_addr that is not aligned to the tlb bucket size. But we're going to add such a case soon, for which this adjustment would be bogus. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 6962cb4efb08..bf063badda1d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -660,7 +660,6 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr == INVALID_PHYS_ADDR) return; - orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1); switch (target) { case SYNC_FOR_CPU: From 1f221a0d0dbf0e48ef3a9c62871281d6a7819f05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Feb 2021 14:39:44 -0500 Subject: [PATCH 09/11] swiotlb: respect min_align_mask Respect the min_align_mask in struct device_dma_parameters in swiotlb. There are two parts to it: 1) for the lower bits of the alignment inside the io tlb slot, just extent the size of the allocation and leave the start of the slot empty 2) for the high bits ensure we find a slot that matches the high bits of the alignment to avoid wasting too much memory Based on an earlier patch from Jianxiong Gao . Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index bf063badda1d..010d531999a6 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -470,6 +470,14 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, #define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) +/* + * Return the offset into a iotlb slot required to keep the device happy. + */ +static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +{ + return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); +} + /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ @@ -491,24 +499,29 @@ static unsigned int wrap_index(unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. */ -static int find_slots(struct device *dev, size_t alloc_size) +static int find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size) { unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); - unsigned int nslots = nr_slots(alloc_size), stride = 1; + unsigned int iotlb_align_mask = + dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); + unsigned int nslots = nr_slots(alloc_size), stride; unsigned int index, wrap, count = 0, i; unsigned long flags; BUG_ON(!nslots); /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. + * For mappings with an alignment requirement don't bother looping to + * unaligned slots once we found an aligned one. For allocations of + * PAGE_SIZE or larger only look for page aligned allocations. */ + stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; if (alloc_size >= PAGE_SIZE) - stride <<= (PAGE_SHIFT - IO_TLB_SHIFT); + stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); spin_lock_irqsave(&io_tlb_lock, flags); if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) @@ -516,6 +529,12 @@ static int find_slots(struct device *dev, size_t alloc_size) index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); do { + if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != + (orig_addr & iotlb_align_mask)) { + index = wrap_index(index + 1); + continue; + } + /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot @@ -559,6 +578,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { + unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, i; phys_addr_t tlb_addr; @@ -574,7 +594,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } - index = find_slots(dev, alloc_size); + index = find_slots(dev, orig_addr, alloc_size + offset); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, @@ -588,10 +608,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nr_slots(alloc_size); i++) + for (i = 0; i < nr_slots(alloc_size + offset); i++) io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - tlb_addr = slot_addr(io_tlb_start, index); + tlb_addr = slot_addr(io_tlb_start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); @@ -606,8 +626,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = nr_slots(alloc_size); - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); + int i, count, nslots = nr_slots(alloc_size + offset); + int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; /* From 3d2d861eb03e8ee96dc430a54361c900cbe28afd Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Mon, 1 Feb 2021 10:30:17 -0800 Subject: [PATCH 10/11] nvme-pci: set min_align_mask The PRP addressing scheme requires all PRP entries except for the first one to have a zero offset into the NVMe controller pages (which can be different from the Linux PAGE_SIZE). Use the min_align_mask device parameter to ensure that swiotlb does not change the address of the buffer modulo the device page size to ensure that the PRPs won't be malformed. Signed-off-by: Jianxiong Gao Signed-off-by: Christoph Hellwig Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 81e6389b2042..83303e4e3019 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2629,6 +2629,7 @@ static void nvme_reset_work(struct work_struct *work) * Don't limit the IOMMU merged segment size. */ dma_set_max_seg_size(dev->dev, 0xffffffff); + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); mutex_unlock(&dev->shutdown_lock); From daf9514fd5eb098d7d6f3a1247cb8cc48fc94155 Mon Sep 17 00:00:00 2001 From: Martin Radev Date: Tue, 12 Jan 2021 16:07:29 +0100 Subject: [PATCH 11/11] swiotlb: Validate bounce size in the sync/unmap path The size of the buffer being bounced is not checked if it happens to be larger than the size of the mapped buffer. Because the size can be controlled by a device, as it's the case with virtio devices, this can lead to memory corruption. This patch saves the remaining buffer memory for each slab and uses that information for validation in the sync/unmap paths before swiotlb_bounce is called. Validating this argument is important under the threat models of AMD SEV-SNP and Intel TDX, where the HV is considered untrusted. Signed-off-by: Martin Radev Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 53 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 010d531999a6..c10e855a03bc 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -99,6 +99,11 @@ static unsigned int max_segment; #define INVALID_PHYS_ADDR (~(phys_addr_t)0) static phys_addr_t *io_tlb_orig_addr; +/* + * The mapped buffer's size should be validated during a sync operation. + */ +static size_t *io_tlb_orig_size; + /* * Protect the above data structures in the map and unmap calls */ @@ -247,9 +252,16 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); + io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_orig_size) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -370,7 +382,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) * between io_tlb_start and io_tlb_end. */ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); + get_order(io_tlb_nslabs * sizeof(int))); if (!io_tlb_list) goto cleanup3; @@ -381,9 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!io_tlb_orig_addr) goto cleanup4; + io_tlb_orig_size = (size_t *) + __get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * + sizeof(size_t))); + if (!io_tlb_orig_size) + goto cleanup5; + + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -396,6 +417,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) return 0; +cleanup5: + free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * + sizeof(phys_addr_t))); + cleanup4: free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); @@ -411,6 +436,8 @@ void __init swiotlb_exit(void) return; if (late_alloc) { + free_pages((unsigned long)io_tlb_orig_size, + get_order(io_tlb_nslabs * sizeof(size_t))); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * @@ -420,6 +447,8 @@ void __init swiotlb_exit(void) } else { memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_orig_size), + PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); memblock_free_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); memblock_free_late(io_tlb_start, @@ -608,9 +637,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nr_slots(alloc_size + offset); i++) + for (i = 0; i < nr_slots(alloc_size + offset); i++) { io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - + io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + } tlb_addr = slot_addr(io_tlb_start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) @@ -618,6 +648,17 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return tlb_addr; } +static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size) +{ + if (*size > orig_size) { + /* Warn and truncate mapping_size */ + dev_WARN_ONCE(hwdev, 1, + "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", + orig_size, *size); + *size = orig_size; + } +} + /* * tlb_addr is the physical address of the bounce buffer to unmap. */ @@ -631,6 +672,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; + validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size); + /* * First, sync the memory before unmapping the entry */ @@ -658,6 +701,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, for (i = index + nslots - 1; i >= index; i--) { io_tlb_list[i] = ++count; io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } /* @@ -677,11 +721,14 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_sync_target target) { int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + size_t orig_size = io_tlb_orig_size[index]; phys_addr_t orig_addr = io_tlb_orig_addr[index]; if (orig_addr == INVALID_PHYS_ADDR) return; + validate_sync_size_and_truncate(hwdev, orig_size, &size); + switch (target) { case SYNC_FOR_CPU: if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))