From d751751a9f7f2f8e406d5a09565d337f009835d6 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Fri, 24 Feb 2017 12:13:37 +0100
Subject: [PATCH 01/17] iommu/iova: Consolidate code for adding new node to
 iovad domain rbtree

This patch consolidates almost the same code used in iova_insert_rbtree()
and __alloc_and_insert_iova_range() functions. While touching this code,
replace BUG() with WARN_ON(1) to avoid taking down the whole system in
case of corrupted iova tree or incorrect calls.

Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iova.c | 87 +++++++++++++++++---------------------------
 1 file changed, 33 insertions(+), 54 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index b7268a14184f..e80a4105ac2a 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -100,6 +100,34 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
 	}
 }
 
+/* Insert the iova into domain rbtree by holding writer lock */
+static void
+iova_insert_rbtree(struct rb_root *root, struct iova *iova,
+		   struct rb_node *start)
+{
+	struct rb_node **new, *parent = NULL;
+
+	new = (start) ? &start : &(root->rb_node);
+	/* Figure out where to put new node */
+	while (*new) {
+		struct iova *this = rb_entry(*new, struct iova, node);
+
+		parent = *new;
+
+		if (iova->pfn_lo < this->pfn_lo)
+			new = &((*new)->rb_left);
+		else if (iova->pfn_lo > this->pfn_lo)
+			new = &((*new)->rb_right);
+		else {
+			WARN_ON(1); /* this should not happen */
+			return;
+		}
+	}
+	/* Add new node and rebalance tree. */
+	rb_link_node(&iova->node, parent, new);
+	rb_insert_color(&iova->node, root);
+}
+
 /*
  * Computes the padding size required, to make the start address
  * naturally aligned on the power-of-two order of its size
@@ -157,35 +185,8 @@ move_left:
 	new->pfn_lo = limit_pfn - (size + pad_size) + 1;
 	new->pfn_hi = new->pfn_lo + size - 1;
 
-	/* Insert the new_iova into domain rbtree by holding writer lock */
-	/* Add new node and rebalance tree. */
-	{
-		struct rb_node **entry, *parent = NULL;
-
-		/* If we have 'prev', it's a valid place to start the
-		   insertion. Otherwise, start from the root. */
-		if (prev)
-			entry = &prev;
-		else
-			entry = &iovad->rbroot.rb_node;
-
-		/* Figure out where to put new node */
-		while (*entry) {
-			struct iova *this = rb_entry(*entry, struct iova, node);
-			parent = *entry;
-
-			if (new->pfn_lo < this->pfn_lo)
-				entry = &((*entry)->rb_left);
-			else if (new->pfn_lo > this->pfn_lo)
-				entry = &((*entry)->rb_right);
-			else
-				BUG(); /* this should not happen */
-		}
-
-		/* Add new node and rebalance tree. */
-		rb_link_node(&new->node, parent, entry);
-		rb_insert_color(&new->node, &iovad->rbroot);
-	}
+	/* If we have 'prev', it's a valid place to start the insertion. */
+	iova_insert_rbtree(&iovad->rbroot, new, prev);
 	__cached_rbnode_insert_update(iovad, saved_pfn, new);
 
 	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
@@ -194,28 +195,6 @@ move_left:
 	return 0;
 }
 
-static void
-iova_insert_rbtree(struct rb_root *root, struct iova *iova)
-{
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
-	/* Figure out where to put new node */
-	while (*new) {
-		struct iova *this = rb_entry(*new, struct iova, node);
-
-		parent = *new;
-
-		if (iova->pfn_lo < this->pfn_lo)
-			new = &((*new)->rb_left);
-		else if (iova->pfn_lo > this->pfn_lo)
-			new = &((*new)->rb_right);
-		else
-			BUG(); /* this should not happen */
-	}
-	/* Add new node and rebalance tree. */
-	rb_link_node(&iova->node, parent, new);
-	rb_insert_color(&iova->node, root);
-}
-
 static struct kmem_cache *iova_cache;
 static unsigned int iova_cache_users;
 static DEFINE_MUTEX(iova_cache_mutex);
@@ -505,7 +484,7 @@ __insert_new_range(struct iova_domain *iovad,
 
 	iova = alloc_and_init_iova(pfn_lo, pfn_hi);
 	if (iova)
-		iova_insert_rbtree(&iovad->rbroot, iova);
+		iova_insert_rbtree(&iovad->rbroot, iova, NULL);
 
 	return iova;
 }
@@ -612,11 +591,11 @@ split_and_remove_iova(struct iova_domain *iovad, struct iova *iova,
 	rb_erase(&iova->node, &iovad->rbroot);
 
 	if (prev) {
-		iova_insert_rbtree(&iovad->rbroot, prev);
+		iova_insert_rbtree(&iovad->rbroot, prev, NULL);
 		iova->pfn_lo = pfn_lo;
 	}
 	if (next) {
-		iova_insert_rbtree(&iovad->rbroot, next);
+		iova_insert_rbtree(&iovad->rbroot, next, NULL);
 		iova->pfn_hi = pfn_hi;
 	}
 	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);

From 82df0a4368c62e68d6622e19a8f4dca2c7eccd3a Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Mon, 20 Mar 2017 20:11:27 +0100
Subject: [PATCH 02/17] MAINTAINERS: Add related headers to IOMMU section

The linux/iommu.h and linux/iova.h headers belong to the IOMMU subsystem
but scripts/get_maintainers.pl currently fails to assign them because
they aren't listed in MAINTAINERS.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c776906f67a9..0a7c6e0e3e82 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6764,6 +6764,8 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git
 S:	Maintained
 F:	Documentation/devicetree/bindings/iommu/
 F:	drivers/iommu/
+F:	include/linux/iommu.h
+F:	include/linux/iova.h
 
 IP MASQUERADING
 M:	Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>

From 21aff52ab2c831c2f07d48e2fa8d4bab26a66992 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Mon, 20 Mar 2017 20:11:28 +0100
Subject: [PATCH 03/17] iommu: Add dummy implementations for !IOMMU_IOVA

Currently, building code which uses the API guarded by the IOMMU_IOVA
will fail to link if IOMMU_IOVA is not enabled. Often this code will be
using the API provided by the IOMMU_API Kconfig symbol, but support for
this can be optional, with code falling back to contiguous memory. This
commit implements dummy functions for the IOVA API so that it can be
compiled out.

With both IOMMU_API and IOMMU_IOVA optional, code can now be built with
or without support for IOMMU without having to resort to #ifdefs in the
user code.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iova.h | 91 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/include/linux/iova.h b/include/linux/iova.h
index f27bb2c62fca..548982ad5f2f 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -82,6 +82,7 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova)
 	return iova >> iova_shift(iovad);
 }
 
+#ifdef CONFIG_IOMMU_IOVA
 int iova_cache_get(void);
 void iova_cache_put(void);
 
@@ -106,5 +107,95 @@ void put_iova_domain(struct iova_domain *iovad);
 struct iova *split_and_remove_iova(struct iova_domain *iovad,
 	struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi);
 void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
+#else
+static inline int iova_cache_get(void)
+{
+	return -ENOTSUPP;
+}
+
+static inline void iova_cache_put(void)
+{
+}
+
+static inline struct iova *alloc_iova_mem(void)
+{
+	return NULL;
+}
+
+static inline void free_iova_mem(struct iova *iova)
+{
+}
+
+static inline void free_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+}
+
+static inline void __free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+}
+
+static inline struct iova *alloc_iova(struct iova_domain *iovad,
+				      unsigned long size,
+				      unsigned long limit_pfn,
+				      bool size_aligned)
+{
+	return NULL;
+}
+
+static inline void free_iova_fast(struct iova_domain *iovad,
+				  unsigned long pfn,
+				  unsigned long size)
+{
+}
+
+static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
+					    unsigned long size,
+					    unsigned long limit_pfn)
+{
+	return 0;
+}
+
+static inline struct iova *reserve_iova(struct iova_domain *iovad,
+					unsigned long pfn_lo,
+					unsigned long pfn_hi)
+{
+	return NULL;
+}
+
+static inline void copy_reserved_iova(struct iova_domain *from,
+				      struct iova_domain *to)
+{
+}
+
+static inline void init_iova_domain(struct iova_domain *iovad,
+				    unsigned long granule,
+				    unsigned long start_pfn,
+				    unsigned long pfn_32bit)
+{
+}
+
+static inline struct iova *find_iova(struct iova_domain *iovad,
+				     unsigned long pfn)
+{
+	return NULL;
+}
+
+static inline void put_iova_domain(struct iova_domain *iovad)
+{
+}
+
+static inline struct iova *split_and_remove_iova(struct iova_domain *iovad,
+						 struct iova *iova,
+						 unsigned long pfn_lo,
+						 unsigned long pfn_hi)
+{
+	return NULL;
+}
+
+static inline void free_cpu_cached_iovas(unsigned int cpu,
+					 struct iova_domain *iovad)
+{
+}
+#endif
 
 #endif

From b4d8c7aea15efa8c6272c58d78296f8b017c4c6a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 23 Mar 2017 00:06:17 +0100
Subject: [PATCH 04/17] iommu/iova: Fix compile error with CONFIG_IOMMU_IOVA=m

The #ifdef in iova.h only catches the CONFIG_IOMMU_IOVA=y
case, so that compilation as a module fails with duplicate
function definition errors. Fix it by catching both cases in
the #if.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iova.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/iova.h b/include/linux/iova.h
index 548982ad5f2f..e0a892ae45c0 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -82,7 +82,7 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova)
 	return iova >> iova_shift(iovad);
 }
 
-#ifdef CONFIG_IOMMU_IOVA
+#if IS_ENABLED(CONFIG_IOMMU_IOVA)
 int iova_cache_get(void);
 void iova_cache_put(void);
 

From 8cadb01d2c2f520a890d1bc78b45471f21b9b76d Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 9 Mar 2017 20:04:57 +0100
Subject: [PATCH 05/17] gpu: host1x: Fix potential out-of-bounds access

The check for valid syncpoint IDs is off by one. While at it, rewrite
the check to make it more easily understandable.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/syncpt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c
index 25c11a85050b..0ac026cdc30c 100644
--- a/drivers/gpu/host1x/syncpt.c
+++ b/drivers/gpu/host1x/syncpt.c
@@ -484,7 +484,7 @@ unsigned int host1x_syncpt_nb_mlocks(struct host1x *host)
 
 struct host1x_syncpt *host1x_syncpt_get(struct host1x *host, unsigned int id)
 {
-	if (host->info->nb_pts < id)
+	if (id >= host->info->nb_pts)
 		return NULL;
 
 	return host->syncpt + id;

From 404bfb78daf3bedafb0bfab24947059575cbea3d Mon Sep 17 00:00:00 2001
From: Mikko Perttunen <mperttunen@nvidia.com>
Date: Wed, 14 Dec 2016 13:16:14 +0200
Subject: [PATCH 06/17] gpu: host1x: Add IOMMU support

Add support for the Host1x unit to be located behind
an IOMMU. This is required when gather buffers may be
allocated non-contiguously in physical memory, as can
be the case when TegraDRM is also using the IOMMU.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/cdma.c       | 74 ++++++++++++++++++++++++++-------
 drivers/gpu/host1x/cdma.h       |  6 ++-
 drivers/gpu/host1x/dev.c        | 41 +++++++++++++++++-
 drivers/gpu/host1x/dev.h        |  6 +++
 drivers/gpu/host1x/hw/cdma_hw.c | 16 ++++---
 drivers/gpu/host1x/job.c        | 72 +++++++++++++++++++++++++++-----
 drivers/gpu/host1x/job.h        |  1 +
 7 files changed, 177 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index c5d82a8a2ec9..28541b280739 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -51,9 +51,15 @@ static void host1x_pushbuffer_destroy(struct push_buffer *pb)
 	struct host1x_cdma *cdma = pb_to_cdma(pb);
 	struct host1x *host1x = cdma_to_host1x(cdma);
 
-	if (pb->phys != 0)
-		dma_free_wc(host1x->dev, pb->size_bytes + 4, pb->mapped,
-			    pb->phys);
+	if (!pb->phys)
+		return;
+
+	if (host1x->domain) {
+		iommu_unmap(host1x->domain, pb->dma, pb->alloc_size);
+		free_iova(&host1x->iova, iova_pfn(&host1x->iova, pb->dma));
+	}
+
+	dma_free_wc(host1x->dev, pb->alloc_size, pb->mapped, pb->phys);
 
 	pb->mapped = NULL;
 	pb->phys = 0;
@@ -66,28 +72,64 @@ static int host1x_pushbuffer_init(struct push_buffer *pb)
 {
 	struct host1x_cdma *cdma = pb_to_cdma(pb);
 	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct iova *alloc;
+	u32 size;
+	int err;
 
 	pb->mapped = NULL;
 	pb->phys = 0;
-	pb->size_bytes = HOST1X_PUSHBUFFER_SLOTS * 8;
+	pb->size = HOST1X_PUSHBUFFER_SLOTS * 8;
+
+	size = pb->size + 4;
 
 	/* initialize buffer pointers */
-	pb->fence = pb->size_bytes - 8;
+	pb->fence = pb->size - 8;
 	pb->pos = 0;
 
-	/* allocate and map pushbuffer memory */
-	pb->mapped = dma_alloc_wc(host1x->dev, pb->size_bytes + 4, &pb->phys,
-				  GFP_KERNEL);
-	if (!pb->mapped)
-		goto fail;
+	if (host1x->domain) {
+		unsigned long shift;
+
+		size = iova_align(&host1x->iova, size);
+
+		pb->mapped = dma_alloc_wc(host1x->dev, size, &pb->phys,
+					  GFP_KERNEL);
+		if (!pb->mapped)
+			return -ENOMEM;
+
+		shift = iova_shift(&host1x->iova);
+		alloc = alloc_iova(&host1x->iova, size >> shift,
+				   host1x->iova_end >> shift, true);
+		if (!alloc) {
+			err = -ENOMEM;
+			goto iommu_free_mem;
+		}
+
+		pb->dma = iova_dma_addr(&host1x->iova, alloc);
+		err = iommu_map(host1x->domain, pb->dma, pb->phys, size,
+				IOMMU_READ);
+		if (err)
+			goto iommu_free_iova;
+	} else {
+		pb->mapped = dma_alloc_wc(host1x->dev, size, &pb->phys,
+					  GFP_KERNEL);
+		if (!pb->mapped)
+			return -ENOMEM;
+
+		pb->dma = pb->phys;
+	}
+
+	pb->alloc_size = size;
 
 	host1x_hw_pushbuffer_init(host1x, pb);
 
 	return 0;
 
-fail:
-	host1x_pushbuffer_destroy(pb);
-	return -ENOMEM;
+iommu_free_iova:
+	__free_iova(&host1x->iova, alloc);
+iommu_free_mem:
+	dma_free_wc(host1x->dev, pb->alloc_size, pb->mapped, pb->phys);
+
+	return err;
 }
 
 /*
@@ -101,7 +143,7 @@ static void host1x_pushbuffer_push(struct push_buffer *pb, u32 op1, u32 op2)
 	WARN_ON(pb->pos == pb->fence);
 	*(p++) = op1;
 	*(p++) = op2;
-	pb->pos = (pb->pos + 8) & (pb->size_bytes - 1);
+	pb->pos = (pb->pos + 8) & (pb->size - 1);
 }
 
 /*
@@ -111,7 +153,7 @@ static void host1x_pushbuffer_push(struct push_buffer *pb, u32 op1, u32 op2)
 static void host1x_pushbuffer_pop(struct push_buffer *pb, unsigned int slots)
 {
 	/* Advance the next write position */
-	pb->fence = (pb->fence + slots * 8) & (pb->size_bytes - 1);
+	pb->fence = (pb->fence + slots * 8) & (pb->size - 1);
 }
 
 /*
@@ -119,7 +161,7 @@ static void host1x_pushbuffer_pop(struct push_buffer *pb, unsigned int slots)
  */
 static u32 host1x_pushbuffer_space(struct push_buffer *pb)
 {
-	return ((pb->fence - pb->pos) & (pb->size_bytes - 1)) / 8;
+	return ((pb->fence - pb->pos) & (pb->size - 1)) / 8;
 }
 
 /*
diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
index 470087af8fe5..ec170a78f4e1 100644
--- a/drivers/gpu/host1x/cdma.h
+++ b/drivers/gpu/host1x/cdma.h
@@ -43,10 +43,12 @@ struct host1x_job;
 
 struct push_buffer {
 	void *mapped;			/* mapped pushbuffer memory */
-	dma_addr_t phys;		/* physical address of pushbuffer */
+	dma_addr_t dma;			/* device address of pushbuffer */
+	phys_addr_t phys;		/* physical address of pushbuffer */
 	u32 fence;			/* index we've written */
 	u32 pos;			/* index to write to */
-	u32 size_bytes;
+	u32 size;
+	u32 alloc_size;
 };
 
 struct buffer_timeout {
diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index a62317af76ad..b386a0bf828a 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -27,6 +27,7 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/host1x.h>
+#undef CREATE_TRACE_POINTS
 
 #include "bus.h"
 #include "dev.h"
@@ -168,16 +169,37 @@ static int host1x_probe(struct platform_device *pdev)
 		return err;
 	}
 
+	if (iommu_present(&platform_bus_type)) {
+		struct iommu_domain_geometry *geometry;
+		unsigned long order;
+
+		host->domain = iommu_domain_alloc(&platform_bus_type);
+		if (!host->domain)
+			return -ENOMEM;
+
+		err = iommu_attach_device(host->domain, &pdev->dev);
+		if (err)
+			goto fail_free_domain;
+
+		geometry = &host->domain->geometry;
+
+		order = __ffs(host->domain->pgsize_bitmap);
+		init_iova_domain(&host->iova, 1UL << order,
+				 geometry->aperture_start >> order,
+				 geometry->aperture_end >> order);
+		host->iova_end = geometry->aperture_end;
+	}
+
 	err = host1x_channel_list_init(host);
 	if (err) {
 		dev_err(&pdev->dev, "failed to initialize channel list\n");
-		return err;
+		goto fail_detach_device;
 	}
 
 	err = clk_prepare_enable(host->clk);
 	if (err < 0) {
 		dev_err(&pdev->dev, "failed to enable clock\n");
-		return err;
+		goto fail_detach_device;
 	}
 
 	err = host1x_syncpt_init(host);
@@ -206,6 +228,15 @@ fail_deinit_syncpt:
 	host1x_syncpt_deinit(host);
 fail_unprepare_disable:
 	clk_disable_unprepare(host->clk);
+fail_detach_device:
+	if (host->domain) {
+		put_iova_domain(&host->iova);
+		iommu_detach_device(host->domain, &pdev->dev);
+	}
+fail_free_domain:
+	if (host->domain)
+		iommu_domain_free(host->domain);
+
 	return err;
 }
 
@@ -218,6 +249,12 @@ static int host1x_remove(struct platform_device *pdev)
 	host1x_syncpt_deinit(host);
 	clk_disable_unprepare(host->clk);
 
+	if (host->domain) {
+		put_iova_domain(&host->iova);
+		iommu_detach_device(host->domain, &pdev->dev);
+		iommu_domain_free(host->domain);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
index 06dd4f85125f..e5113acecd7a 100644
--- a/drivers/gpu/host1x/dev.h
+++ b/drivers/gpu/host1x/dev.h
@@ -19,6 +19,8 @@
 
 #include <linux/platform_device.h>
 #include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/iova.h>
 
 #include "channel.h"
 #include "syncpt.h"
@@ -108,6 +110,10 @@ struct host1x {
 	struct device *dev;
 	struct clk *clk;
 
+	struct iommu_domain *domain;
+	struct iova_domain iova;
+	dma_addr_t iova_end;
+
 	struct mutex intr_mutex;
 	int intr_syncpt_irq;
 
diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
index 659c1bbfeeba..6b231119193e 100644
--- a/drivers/gpu/host1x/hw/cdma_hw.c
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -30,7 +30,7 @@
  */
 static void push_buffer_init(struct push_buffer *pb)
 {
-	*(u32 *)(pb->mapped + pb->size_bytes) = host1x_opcode_restart(0);
+	*(u32 *)(pb->mapped + pb->size) = host1x_opcode_restart(0);
 }
 
 /*
@@ -55,8 +55,8 @@ static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
 		*(p++) = HOST1X_OPCODE_NOP;
 		*(p++) = HOST1X_OPCODE_NOP;
 		dev_dbg(host1x->dev, "%s: NOP at %pad+%#x\n", __func__,
-			&pb->phys, getptr);
-		getptr = (getptr + 8) & (pb->size_bytes - 1);
+			&pb->dma, getptr);
+		getptr = (getptr + 8) & (pb->size - 1);
 	}
 
 	wmb();
@@ -78,10 +78,9 @@ static void cdma_start(struct host1x_cdma *cdma)
 			 HOST1X_CHANNEL_DMACTRL);
 
 	/* set base, put and end pointer */
-	host1x_ch_writel(ch, cdma->push_buffer.phys, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, cdma->push_buffer.dma, HOST1X_CHANNEL_DMASTART);
 	host1x_ch_writel(ch, cdma->push_buffer.pos, HOST1X_CHANNEL_DMAPUT);
-	host1x_ch_writel(ch, cdma->push_buffer.phys +
-			 cdma->push_buffer.size_bytes + 4,
+	host1x_ch_writel(ch, cdma->push_buffer.dma + cdma->push_buffer.size + 4,
 			 HOST1X_CHANNEL_DMAEND);
 
 	/* reset GET */
@@ -115,9 +114,8 @@ static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
 			 HOST1X_CHANNEL_DMACTRL);
 
 	/* set base, end pointer (all of memory) */
-	host1x_ch_writel(ch, cdma->push_buffer.phys, HOST1X_CHANNEL_DMASTART);
-	host1x_ch_writel(ch, cdma->push_buffer.phys +
-			 cdma->push_buffer.size_bytes,
+	host1x_ch_writel(ch, cdma->push_buffer.dma, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, cdma->push_buffer.dma + cdma->push_buffer.size,
 			 HOST1X_CHANNEL_DMAEND);
 
 	/* set GET, by loading the value in PUT (then reset GET) */
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
index 92c3df933303..5f5f8ee6143d 100644
--- a/drivers/gpu/host1x/job.c
+++ b/drivers/gpu/host1x/job.c
@@ -174,9 +174,10 @@ static int do_waitchks(struct host1x_job *job, struct host1x *host,
 	return 0;
 }
 
-static unsigned int pin_job(struct host1x_job *job)
+static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
 {
 	unsigned int i;
+	int err;
 
 	job->num_unpins = 0;
 
@@ -186,12 +187,16 @@ static unsigned int pin_job(struct host1x_job *job)
 		dma_addr_t phys_addr;
 
 		reloc->target.bo = host1x_bo_get(reloc->target.bo);
-		if (!reloc->target.bo)
+		if (!reloc->target.bo) {
+			err = -EINVAL;
 			goto unpin;
+		}
 
 		phys_addr = host1x_bo_pin(reloc->target.bo, &sgt);
-		if (!phys_addr)
+		if (!phys_addr) {
+			err = -EINVAL;
 			goto unpin;
+		}
 
 		job->addr_phys[job->num_unpins] = phys_addr;
 		job->unpins[job->num_unpins].bo = reloc->target.bo;
@@ -201,28 +206,67 @@ static unsigned int pin_job(struct host1x_job *job)
 
 	for (i = 0; i < job->num_gathers; i++) {
 		struct host1x_job_gather *g = &job->gathers[i];
+		size_t gather_size = 0;
+		struct scatterlist *sg;
 		struct sg_table *sgt;
 		dma_addr_t phys_addr;
+		unsigned long shift;
+		struct iova *alloc;
+		unsigned int j;
 
 		g->bo = host1x_bo_get(g->bo);
-		if (!g->bo)
+		if (!g->bo) {
+			err = -EINVAL;
 			goto unpin;
+		}
 
 		phys_addr = host1x_bo_pin(g->bo, &sgt);
-		if (!phys_addr)
+		if (!phys_addr) {
+			err = -EINVAL;
 			goto unpin;
+		}
+
+		if (!IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL) && host->domain) {
+			for_each_sg(sgt->sgl, sg, sgt->nents, j)
+				gather_size += sg->length;
+			gather_size = iova_align(&host->iova, gather_size);
+
+			shift = iova_shift(&host->iova);
+			alloc = alloc_iova(&host->iova, gather_size >> shift,
+					   host->iova_end >> shift, true);
+			if (!alloc) {
+				err = -ENOMEM;
+				goto unpin;
+			}
+
+			err = iommu_map_sg(host->domain,
+					iova_dma_addr(&host->iova, alloc),
+					sgt->sgl, sgt->nents, IOMMU_READ);
+			if (err == 0) {
+				__free_iova(&host->iova, alloc);
+				err = -EINVAL;
+				goto unpin;
+			}
+
+			job->addr_phys[job->num_unpins] =
+				iova_dma_addr(&host->iova, alloc);
+			job->unpins[job->num_unpins].size = gather_size;
+		} else {
+			job->addr_phys[job->num_unpins] = phys_addr;
+		}
+
+		job->gather_addr_phys[i] = job->addr_phys[job->num_unpins];
 
-		job->addr_phys[job->num_unpins] = phys_addr;
 		job->unpins[job->num_unpins].bo = g->bo;
 		job->unpins[job->num_unpins].sgt = sgt;
 		job->num_unpins++;
 	}
 
-	return job->num_unpins;
+	return 0;
 
 unpin:
 	host1x_job_unpin(job);
-	return 0;
+	return err;
 }
 
 static int do_relocs(struct host1x_job *job, struct host1x_bo *cmdbuf)
@@ -525,8 +569,8 @@ int host1x_job_pin(struct host1x_job *job, struct device *dev)
 		host1x_syncpt_load(host->syncpt + i);
 
 	/* pin memory */
-	err = pin_job(job);
-	if (!err)
+	err = pin_job(host, job);
+	if (err)
 		goto out;
 
 	/* patch gathers */
@@ -572,11 +616,19 @@ EXPORT_SYMBOL(host1x_job_pin);
 
 void host1x_job_unpin(struct host1x_job *job)
 {
+	struct host1x *host = dev_get_drvdata(job->channel->dev->parent);
 	unsigned int i;
 
 	for (i = 0; i < job->num_unpins; i++) {
 		struct host1x_job_unpin_data *unpin = &job->unpins[i];
 
+		if (!IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL) && host->domain) {
+			iommu_unmap(host->domain, job->addr_phys[i],
+				    unpin->size);
+			free_iova(&host->iova,
+				iova_pfn(&host->iova, job->addr_phys[i]));
+		}
+
 		host1x_bo_unpin(unpin->bo, unpin->sgt);
 		host1x_bo_put(unpin->bo);
 	}
diff --git a/drivers/gpu/host1x/job.h b/drivers/gpu/host1x/job.h
index 8b3c15df0660..878239c476d2 100644
--- a/drivers/gpu/host1x/job.h
+++ b/drivers/gpu/host1x/job.h
@@ -44,6 +44,7 @@ struct host1x_waitchk {
 struct host1x_job_unpin_data {
 	struct host1x_bo *bo;
 	struct sg_table *sgt;
+	size_t size;
 };
 
 /*

From 398cbaadecdd168fe175d559ddb1671dfa11e582 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Mon, 20 Mar 2017 20:04:20 +0100
Subject: [PATCH 07/17] drm/tegra: Enable IOVA API when IOMMU support is
 enabled

When support for an IOMMU has been enabled, make sure the IOVA helper
code is also activated and the driver can properly manage the IO virtual
address space.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/tegra/Kconfig b/drivers/gpu/drm/tegra/Kconfig
index bbf5a4b7e0b6..2db29d67193d 100644
--- a/drivers/gpu/drm/tegra/Kconfig
+++ b/drivers/gpu/drm/tegra/Kconfig
@@ -7,6 +7,7 @@ config DRM_TEGRA
 	select DRM_MIPI_DSI
 	select DRM_PANEL
 	select TEGRA_HOST1X
+	select IOMMU_IOVA if IOMMU_SUPPORT
 	help
 	  Choose this option if you have an NVIDIA Tegra SoC.
 

From 347ad49d35a1c65d509e7ef5b0760e97ede41ec2 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 9 Mar 2017 20:04:56 +0100
Subject: [PATCH 08/17] drm/tegra: Protect IOMMU operations by mutex

IOMMU support is currently not thread-safe, which can cause crashes,
amongst other things, under certain workloads.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/drm.c |  5 +++++
 drivers/gpu/drm/tegra/drm.h |  1 +
 drivers/gpu/drm/tegra/gem.c | 12 ++++++++++--
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index ef215fef63d6..90041d5388d1 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -142,6 +142,7 @@ static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 		DRM_DEBUG_DRIVER("IOMMU aperture initialized (%#llx-%#llx)\n",
 				 start, end);
 		drm_mm_init(&tegra->mm, start, end - start + 1);
+		mutex_init(&tegra->mm_lock);
 	}
 
 	mutex_init(&tegra->clients_lock);
@@ -208,6 +209,7 @@ config:
 	if (tegra->domain) {
 		iommu_domain_free(tegra->domain);
 		drm_mm_takedown(&tegra->mm);
+		mutex_destroy(&tegra->mm_lock);
 	}
 free:
 	kfree(tegra);
@@ -232,6 +234,7 @@ static void tegra_drm_unload(struct drm_device *drm)
 	if (tegra->domain) {
 		iommu_domain_free(tegra->domain);
 		drm_mm_takedown(&tegra->mm);
+		mutex_destroy(&tegra->mm_lock);
 	}
 
 	kfree(tegra);
@@ -878,7 +881,9 @@ static int tegra_debugfs_iova(struct seq_file *s, void *data)
 	struct tegra_drm *tegra = drm->dev_private;
 	struct drm_printer p = drm_seq_file_printer(s);
 
+	mutex_lock(&tegra->mm_lock);
 	drm_mm_print(&tegra->mm, &p);
+	mutex_unlock(&tegra->mm_lock);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h
index 5205790dd679..d168beaf13ef 100644
--- a/drivers/gpu/drm/tegra/drm.h
+++ b/drivers/gpu/drm/tegra/drm.h
@@ -42,6 +42,7 @@ struct tegra_drm {
 	struct drm_device *drm;
 
 	struct iommu_domain *domain;
+	struct mutex mm_lock;
 	struct drm_mm mm;
 
 	struct mutex clients_lock;
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index 17e62ecb5d4d..050ba78d3fab 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -128,12 +128,14 @@ static int tegra_bo_iommu_map(struct tegra_drm *tegra, struct tegra_bo *bo)
 	if (!bo->mm)
 		return -ENOMEM;
 
+	mutex_lock(&tegra->mm_lock);
+
 	err = drm_mm_insert_node_generic(&tegra->mm,
 					 bo->mm, bo->gem.size, PAGE_SIZE, 0, 0);
 	if (err < 0) {
 		dev_err(tegra->drm->dev, "out of I/O virtual memory: %zd\n",
 			err);
-		goto free;
+		goto unlock;
 	}
 
 	bo->paddr = bo->mm->start;
@@ -147,11 +149,14 @@ static int tegra_bo_iommu_map(struct tegra_drm *tegra, struct tegra_bo *bo)
 
 	bo->size = err;
 
+	mutex_unlock(&tegra->mm_lock);
+
 	return 0;
 
 remove:
 	drm_mm_remove_node(bo->mm);
-free:
+unlock:
+	mutex_unlock(&tegra->mm_lock);
 	kfree(bo->mm);
 	return err;
 }
@@ -161,8 +166,11 @@ static int tegra_bo_iommu_unmap(struct tegra_drm *tegra, struct tegra_bo *bo)
 	if (!bo->mm)
 		return 0;
 
+	mutex_lock(&tegra->mm_lock);
 	iommu_unmap(tegra->domain, bo->paddr, bo->size);
 	drm_mm_remove_node(bo->mm);
+	mutex_unlock(&tegra->mm_lock);
+
 	kfree(bo->mm);
 
 	return 0;

From bdd2f9cd10eb842be96418cc226bc33744d358b0 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 9 Mar 2017 20:04:55 +0100
Subject: [PATCH 09/17] drm/tegra: Don't leak kernel pointer to userspace

Each open file descriptor can have any number of contexts associated
with it. To differentiate between these contexts a unique ID is required
and back when these userspace interfaces were introduced, in commit
d43f81cbaf43 ("drm/tegra: Add gr2d device"), the pointer to the context
structure was deemed adequate. However, this leaks information about
kernel internal memory to userspace, which can potentially be exploited.

Switch the context parameter to be allocated from an IDR, which has the
added benefit of providing an easy way to look up a context from its ID.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/drm.c | 160 +++++++++++++++++++++++++-----------
 drivers/gpu/drm/tegra/drm.h |   2 +-
 2 files changed, 114 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 90041d5388d1..948b529d4097 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/host1x.h>
+#include <linux/idr.h>
 #include <linux/iommu.h>
 
 #include <drm/drm_atomic.h>
@@ -24,7 +25,8 @@
 #define DRIVER_PATCHLEVEL 0
 
 struct tegra_drm_file {
-	struct list_head contexts;
+	struct idr contexts;
+	struct mutex lock;
 };
 
 static void tegra_atomic_schedule(struct tegra_drm *tegra,
@@ -248,7 +250,8 @@ static int tegra_drm_open(struct drm_device *drm, struct drm_file *filp)
 	if (!fpriv)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&fpriv->contexts);
+	idr_init(&fpriv->contexts);
+	mutex_init(&fpriv->lock);
 	filp->driver_priv = fpriv;
 
 	return 0;
@@ -427,21 +430,16 @@ fail:
 
 
 #ifdef CONFIG_DRM_TEGRA_STAGING
-static struct tegra_drm_context *tegra_drm_get_context(__u64 context)
+static struct tegra_drm_context *
+tegra_drm_file_get_context(struct tegra_drm_file *file, u32 id)
 {
-	return (struct tegra_drm_context *)(uintptr_t)context;
-}
+	struct tegra_drm_context *context;
 
-static bool tegra_drm_file_owns_context(struct tegra_drm_file *file,
-					struct tegra_drm_context *context)
-{
-	struct tegra_drm_context *ctx;
+	mutex_lock(&file->lock);
+	context = idr_find(&file->contexts, id);
+	mutex_unlock(&file->lock);
 
-	list_for_each_entry(ctx, &file->contexts, list)
-		if (ctx == context)
-			return true;
-
-	return false;
+	return context;
 }
 
 static int tegra_gem_create(struct drm_device *drm, void *data,
@@ -522,6 +520,28 @@ static int tegra_syncpt_wait(struct drm_device *drm, void *data,
 				  &args->value);
 }
 
+static int tegra_client_open(struct tegra_drm_file *fpriv,
+			     struct tegra_drm_client *client,
+			     struct tegra_drm_context *context)
+{
+	int err;
+
+	err = client->ops->open_channel(client, context);
+	if (err < 0)
+		return err;
+
+	err = idr_alloc(&fpriv->contexts, context, 0, 0, GFP_KERNEL);
+	if (err < 0) {
+		client->ops->close_channel(context);
+		return err;
+	}
+
+	context->client = client;
+	context->id = err;
+
+	return 0;
+}
+
 static int tegra_open_channel(struct drm_device *drm, void *data,
 			      struct drm_file *file)
 {
@@ -536,19 +556,22 @@ static int tegra_open_channel(struct drm_device *drm, void *data,
 	if (!context)
 		return -ENOMEM;
 
+	mutex_lock(&fpriv->lock);
+
 	list_for_each_entry(client, &tegra->clients, list)
 		if (client->base.class == args->client) {
-			err = client->ops->open_channel(client, context);
-			if (err)
+			err = tegra_client_open(fpriv, client, context);
+			if (err < 0)
 				break;
 
-			list_add(&context->list, &fpriv->contexts);
-			args->context = (uintptr_t)context;
-			context->client = client;
-			return 0;
+			args->context = context->id;
+			break;
 		}
 
-	kfree(context);
+	if (err < 0)
+		kfree(context);
+
+	mutex_unlock(&fpriv->lock);
 	return err;
 }
 
@@ -558,16 +581,22 @@ static int tegra_close_channel(struct drm_device *drm, void *data,
 	struct tegra_drm_file *fpriv = file->driver_priv;
 	struct drm_tegra_close_channel *args = data;
 	struct tegra_drm_context *context;
+	int err = 0;
 
-	context = tegra_drm_get_context(args->context);
+	mutex_lock(&fpriv->lock);
 
-	if (!tegra_drm_file_owns_context(fpriv, context))
-		return -EINVAL;
+	context = tegra_drm_file_get_context(fpriv, args->context);
+	if (!context) {
+		err = -EINVAL;
+		goto unlock;
+	}
 
-	list_del(&context->list);
+	idr_remove(&fpriv->contexts, context->id);
 	tegra_drm_context_free(context);
 
-	return 0;
+unlock:
+	mutex_unlock(&fpriv->lock);
+	return err;
 }
 
 static int tegra_get_syncpt(struct drm_device *drm, void *data,
@@ -577,19 +606,27 @@ static int tegra_get_syncpt(struct drm_device *drm, void *data,
 	struct drm_tegra_get_syncpt *args = data;
 	struct tegra_drm_context *context;
 	struct host1x_syncpt *syncpt;
+	int err = 0;
 
-	context = tegra_drm_get_context(args->context);
+	mutex_lock(&fpriv->lock);
 
-	if (!tegra_drm_file_owns_context(fpriv, context))
-		return -ENODEV;
+	context = tegra_drm_file_get_context(fpriv, args->context);
+	if (!context) {
+		err = -ENODEV;
+		goto unlock;
+	}
 
-	if (args->index >= context->client->base.num_syncpts)
-		return -EINVAL;
+	if (args->index >= context->client->base.num_syncpts) {
+		err = -EINVAL;
+		goto unlock;
+	}
 
 	syncpt = context->client->base.syncpts[args->index];
 	args->id = host1x_syncpt_id(syncpt);
 
-	return 0;
+unlock:
+	mutex_unlock(&fpriv->lock);
+	return err;
 }
 
 static int tegra_submit(struct drm_device *drm, void *data,
@@ -598,13 +635,21 @@ static int tegra_submit(struct drm_device *drm, void *data,
 	struct tegra_drm_file *fpriv = file->driver_priv;
 	struct drm_tegra_submit *args = data;
 	struct tegra_drm_context *context;
+	int err;
 
-	context = tegra_drm_get_context(args->context);
+	mutex_lock(&fpriv->lock);
 
-	if (!tegra_drm_file_owns_context(fpriv, context))
-		return -ENODEV;
+	context = tegra_drm_file_get_context(fpriv, args->context);
+	if (!context) {
+		err = -ENODEV;
+		goto unlock;
+	}
 
-	return context->client->ops->submit(context, args, drm, file);
+	err = context->client->ops->submit(context, args, drm, file);
+
+unlock:
+	mutex_unlock(&fpriv->lock);
+	return err;
 }
 
 static int tegra_get_syncpt_base(struct drm_device *drm, void *data,
@@ -615,24 +660,34 @@ static int tegra_get_syncpt_base(struct drm_device *drm, void *data,
 	struct tegra_drm_context *context;
 	struct host1x_syncpt_base *base;
 	struct host1x_syncpt *syncpt;
+	int err = 0;
 
-	context = tegra_drm_get_context(args->context);
+	mutex_lock(&fpriv->lock);
 
-	if (!tegra_drm_file_owns_context(fpriv, context))
-		return -ENODEV;
+	context = tegra_drm_file_get_context(fpriv, args->context);
+	if (!context) {
+		err = -ENODEV;
+		goto unlock;
+	}
 
-	if (args->syncpt >= context->client->base.num_syncpts)
-		return -EINVAL;
+	if (args->syncpt >= context->client->base.num_syncpts) {
+		err = -EINVAL;
+		goto unlock;
+	}
 
 	syncpt = context->client->base.syncpts[args->syncpt];
 
 	base = host1x_syncpt_get_base(syncpt);
-	if (!base)
-		return -ENXIO;
+	if (!base) {
+		err = -ENXIO;
+		goto unlock;
+	}
 
 	args->id = host1x_syncpt_base_id(base);
 
-	return 0;
+unlock:
+	mutex_unlock(&fpriv->lock);
+	return err;
 }
 
 static int tegra_gem_set_tiling(struct drm_device *drm, void *data,
@@ -841,14 +896,25 @@ static void tegra_drm_disable_vblank(struct drm_device *drm, unsigned int pipe)
 		tegra_dc_disable_vblank(dc);
 }
 
+static int tegra_drm_context_cleanup(int id, void *p, void *data)
+{
+	struct tegra_drm_context *context = p;
+
+	tegra_drm_context_free(context);
+
+	return 0;
+}
+
 static void tegra_drm_preclose(struct drm_device *drm, struct drm_file *file)
 {
 	struct tegra_drm_file *fpriv = file->driver_priv;
-	struct tegra_drm_context *context, *tmp;
 
-	list_for_each_entry_safe(context, tmp, &fpriv->contexts, list)
-		tegra_drm_context_free(context);
+	mutex_lock(&fpriv->lock);
+	idr_for_each(&fpriv->contexts, tegra_drm_context_cleanup, NULL);
+	mutex_unlock(&fpriv->lock);
 
+	idr_destroy(&fpriv->contexts);
+	mutex_destroy(&fpriv->lock);
 	kfree(fpriv);
 }
 
diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h
index d168beaf13ef..368dde1bed18 100644
--- a/drivers/gpu/drm/tegra/drm.h
+++ b/drivers/gpu/drm/tegra/drm.h
@@ -68,7 +68,7 @@ struct tegra_drm_client;
 struct tegra_drm_context {
 	struct tegra_drm_client *client;
 	struct host1x_channel *channel;
-	struct list_head list;
+	unsigned int id;
 };
 
 struct tegra_drm_client_ops {

From 5e91144dd702d068b22a75911c06104e56cb4858 Mon Sep 17 00:00:00 2001
From: Alexandre Courbot <acourbot@nvidia.com>
Date: Tue, 8 Nov 2016 16:50:42 +0900
Subject: [PATCH 10/17] drm/tegra: Add tiling FB modifiers

Add FB modifiers to allow user-space to specify that a surface is in one
of the two tiling formats supported by Tegra chips, and add support in
the tegradrm driver to handle them properly. This is necessary for the
display controller to directly display buffers generated by the GPU.

This feature is intended to replace the dedicated IOCTL enabled
by TEGRA_STAGING and to provide a non-staging alternative to that
solution.

Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
Acked-by: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/drm.c   |  2 ++
 drivers/gpu/drm/tegra/fb.c    | 21 ++++++++++++++--
 include/uapi/drm/drm_fourcc.h | 45 +++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 948b529d4097..3f8bd7bd6532 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -164,6 +164,8 @@ static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 	drm->mode_config.max_width = 4096;
 	drm->mode_config.max_height = 4096;
 
+	drm->mode_config.allow_fb_modifiers = true;
+
 	drm->mode_config.funcs = &tegra_drm_mode_funcs;
 
 	err = tegra_drm_fb_prepare(drm);
diff --git a/drivers/gpu/drm/tegra/fb.c b/drivers/gpu/drm/tegra/fb.c
index f142f6a4db25..d53f49f60b6f 100644
--- a/drivers/gpu/drm/tegra/fb.c
+++ b/drivers/gpu/drm/tegra/fb.c
@@ -52,9 +52,26 @@ int tegra_fb_get_tiling(struct drm_framebuffer *framebuffer,
 			struct tegra_bo_tiling *tiling)
 {
 	struct tegra_fb *fb = to_tegra_fb(framebuffer);
+	uint64_t modifier = fb->base.modifier;
 
-	/* TODO: handle YUV formats? */
-	*tiling = fb->planes[0]->tiling;
+	switch (fourcc_mod_tegra_mod(modifier)) {
+	case NV_FORMAT_MOD_TEGRA_TILED:
+		tiling->mode = TEGRA_BO_TILING_MODE_TILED;
+		tiling->value = 0;
+		break;
+
+	case NV_FORMAT_MOD_TEGRA_16BX2_BLOCK(0):
+		tiling->mode = TEGRA_BO_TILING_MODE_BLOCK;
+		tiling->value = fourcc_mod_tegra_param(modifier);
+		if (tiling->value > 5)
+			return -EINVAL;
+		break;
+
+	default:
+		/* TODO: handle YUV formats? */
+		*tiling = fb->planes[0]->tiling;
+		break;
+	}
 
 	return 0;
 }
diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index ef20abb8119b..983a4f3ba5e1 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -292,6 +292,51 @@ extern "C" {
  */
 #define DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED fourcc_mod_code(VIVANTE, 4)
 
+
+/* NVIDIA Tegra frame buffer modifiers */
+
+/*
+ * Some modifiers take parameters, for example the number of vertical GOBs in
+ * a block. Reserve the lower 32 bits for parameters
+ */
+#define __fourcc_mod_tegra_mode_shift 32
+#define fourcc_mod_tegra_code(val, params) \
+	fourcc_mod_code(NV, ((((__u64)val) << __fourcc_mod_tegra_mode_shift) | params))
+#define fourcc_mod_tegra_mod(m) \
+	(m & ~((1ULL << __fourcc_mod_tegra_mode_shift) - 1))
+#define fourcc_mod_tegra_param(m) \
+	(m & ((1ULL << __fourcc_mod_tegra_mode_shift) - 1))
+
+/*
+ * Tegra Tiled Layout, used by Tegra 2, 3 and 4.
+ *
+ * Pixels are arranged in simple tiles of 16 x 16 bytes.
+ */
+#define NV_FORMAT_MOD_TEGRA_TILED fourcc_mod_tegra_code(1, 0)
+
+/*
+ * Tegra 16Bx2 Block Linear layout, used by TK1/TX1
+ *
+ * Pixels are arranged in 64x8 Groups Of Bytes (GOBs). GOBs are then stacked
+ * vertically by a power of 2 (1 to 32 GOBs) to form a block.
+ *
+ * Within a GOB, data is ordered as 16B x 2 lines sectors laid in Z-shape.
+ *
+ * Parameter 'v' is the log2 encoding of the number of GOBs stacked vertically.
+ * Valid values are:
+ *
+ * 0 == ONE_GOB
+ * 1 == TWO_GOBS
+ * 2 == FOUR_GOBS
+ * 3 == EIGHT_GOBS
+ * 4 == SIXTEEN_GOBS
+ * 5 == THIRTYTWO_GOBS
+ *
+ * Chapter 20 "Pixel Memory Formats" of the Tegra X1 TRM describes this format
+ * in full detail.
+ */
+#define NV_FORMAT_MOD_TEGRA_16BX2_BLOCK(v) fourcc_mod_tegra_code(2, v)
+
 #if defined(__cplusplus)
 }
 #endif

From ad92601521ea1928e702dc709384d21e96202155 Mon Sep 17 00:00:00 2001
From: Mikko Perttunen <mperttunen@nvidia.com>
Date: Wed, 14 Dec 2016 13:16:11 +0200
Subject: [PATCH 11/17] drm/tegra: Add Tegra DRM allocation API

Add a new IO virtual memory allocation API to allow clients to
allocate non-GEM memory in the Tegra DRM IOMMU domain. This is
required e.g. for loading client firmware when clients are attached
to the IOMMU domain.

The allocator allocates contiguous physical pages that are then
mapped contiguously to the IOMMU domain using the iova_domain
library provided by the kernel. Contiguous physical pages are
used so that the same allocator works also when IOMMU support
is disabled and therefore devices access physical memory directly.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/drm.c | 111 +++++++++++++++++++++++++++++++++---
 drivers/gpu/drm/tegra/drm.h |  11 ++++
 2 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 3f8bd7bd6532..d9211cb78da5 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -1,12 +1,13 @@
 /*
  * Copyright (C) 2012 Avionic Design GmbH
- * Copyright (C) 2012-2013 NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (C) 2012-2016 NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
 
+#include <linux/bitops.h>
 #include <linux/host1x.h>
 #include <linux/idr.h>
 #include <linux/iommu.h>
@@ -24,6 +25,8 @@
 #define DRIVER_MINOR 0
 #define DRIVER_PATCHLEVEL 0
 
+#define CARVEOUT_SZ SZ_64M
+
 struct tegra_drm_file {
 	struct idr contexts;
 	struct mutex lock;
@@ -128,8 +131,9 @@ static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 		return -ENOMEM;
 
 	if (iommu_present(&platform_bus_type)) {
+		u64 carveout_start, carveout_end, gem_start, gem_end;
 		struct iommu_domain_geometry *geometry;
-		u64 start, end;
+		unsigned long order;
 
 		tegra->domain = iommu_domain_alloc(&platform_bus_type);
 		if (!tegra->domain) {
@@ -138,13 +142,26 @@ static int tegra_drm_load(struct drm_device *drm, unsigned long flags)
 		}
 
 		geometry = &tegra->domain->geometry;
-		start = geometry->aperture_start;
-		end = geometry->aperture_end;
+		gem_start = geometry->aperture_start;
+		gem_end = geometry->aperture_end - CARVEOUT_SZ;
+		carveout_start = gem_end + 1;
+		carveout_end = geometry->aperture_end;
 
-		DRM_DEBUG_DRIVER("IOMMU aperture initialized (%#llx-%#llx)\n",
-				 start, end);
-		drm_mm_init(&tegra->mm, start, end - start + 1);
+		order = __ffs(tegra->domain->pgsize_bitmap);
+		init_iova_domain(&tegra->carveout.domain, 1UL << order,
+				 carveout_start >> order,
+				 carveout_end >> order);
+
+		tegra->carveout.shift = iova_shift(&tegra->carveout.domain);
+		tegra->carveout.limit = carveout_end >> tegra->carveout.shift;
+
+		drm_mm_init(&tegra->mm, gem_start, gem_end - gem_start + 1);
 		mutex_init(&tegra->mm_lock);
+
+		DRM_DEBUG("IOMMU apertures:\n");
+		DRM_DEBUG("  GEM: %#llx-%#llx\n", gem_start, gem_end);
+		DRM_DEBUG("  Carveout: %#llx-%#llx\n", carveout_start,
+			  carveout_end);
 	}
 
 	mutex_init(&tegra->clients_lock);
@@ -214,6 +231,7 @@ config:
 		iommu_domain_free(tegra->domain);
 		drm_mm_takedown(&tegra->mm);
 		mutex_destroy(&tegra->mm_lock);
+		put_iova_domain(&tegra->carveout.domain);
 	}
 free:
 	kfree(tegra);
@@ -239,6 +257,7 @@ static void tegra_drm_unload(struct drm_device *drm)
 		iommu_domain_free(tegra->domain);
 		drm_mm_takedown(&tegra->mm);
 		mutex_destroy(&tegra->mm_lock);
+		put_iova_domain(&tegra->carveout.domain);
 	}
 
 	kfree(tegra);
@@ -1030,6 +1049,84 @@ int tegra_drm_unregister_client(struct tegra_drm *tegra,
 	return 0;
 }
 
+void *tegra_drm_alloc(struct tegra_drm *tegra, size_t size,
+			      dma_addr_t *dma)
+{
+	struct iova *alloc;
+	void *virt;
+	gfp_t gfp;
+	int err;
+
+	if (tegra->domain)
+		size = iova_align(&tegra->carveout.domain, size);
+	else
+		size = PAGE_ALIGN(size);
+
+	gfp = GFP_KERNEL | __GFP_ZERO;
+	if (!tegra->domain) {
+		/*
+		 * Many units only support 32-bit addresses, even on 64-bit
+		 * SoCs. If there is no IOMMU to translate into a 32-bit IO
+		 * virtual address space, force allocations to be in the
+		 * lower 32-bit range.
+		 */
+		gfp |= GFP_DMA;
+	}
+
+	virt = (void *)__get_free_pages(gfp, get_order(size));
+	if (!virt)
+		return ERR_PTR(-ENOMEM);
+
+	if (!tegra->domain) {
+		/*
+		 * If IOMMU is disabled, devices address physical memory
+		 * directly.
+		 */
+		*dma = virt_to_phys(virt);
+		return virt;
+	}
+
+	alloc = alloc_iova(&tegra->carveout.domain,
+			   size >> tegra->carveout.shift,
+			   tegra->carveout.limit, true);
+	if (!alloc) {
+		err = -EBUSY;
+		goto free_pages;
+	}
+
+	*dma = iova_dma_addr(&tegra->carveout.domain, alloc);
+	err = iommu_map(tegra->domain, *dma, virt_to_phys(virt),
+			size, IOMMU_READ | IOMMU_WRITE);
+	if (err < 0)
+		goto free_iova;
+
+	return virt;
+
+free_iova:
+	__free_iova(&tegra->carveout.domain, alloc);
+free_pages:
+	free_pages((unsigned long)virt, get_order(size));
+
+	return ERR_PTR(err);
+}
+
+void tegra_drm_free(struct tegra_drm *tegra, size_t size, void *virt,
+		    dma_addr_t dma)
+{
+	if (tegra->domain)
+		size = iova_align(&tegra->carveout.domain, size);
+	else
+		size = PAGE_ALIGN(size);
+
+	if (tegra->domain) {
+		iommu_unmap(tegra->domain, dma, size);
+		free_iova(&tegra->carveout.domain,
+			  iova_pfn(&tegra->carveout.domain, dma));
+	}
+
+	free_pages((unsigned long)virt, get_order(size));
+}
+
 static int host1x_drm_probe(struct host1x_device *dev)
 {
 	struct drm_driver *driver = &tegra_drm_driver;
diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h
index 368dde1bed18..668ccfb5cb05 100644
--- a/drivers/gpu/drm/tegra/drm.h
+++ b/drivers/gpu/drm/tegra/drm.h
@@ -12,6 +12,7 @@
 
 #include <uapi/drm/tegra_drm.h>
 #include <linux/host1x.h>
+#include <linux/iova.h>
 #include <linux/of_gpio.h>
 
 #include <drm/drmP.h>
@@ -45,6 +46,12 @@ struct tegra_drm {
 	struct mutex mm_lock;
 	struct drm_mm mm;
 
+	struct {
+		struct iova_domain domain;
+		unsigned long shift;
+		unsigned long limit;
+	} carveout;
+
 	struct mutex clients_lock;
 	struct list_head clients;
 
@@ -106,6 +113,10 @@ int tegra_drm_unregister_client(struct tegra_drm *tegra,
 int tegra_drm_init(struct tegra_drm *tegra, struct drm_device *drm);
 int tegra_drm_exit(struct tegra_drm *tegra);
 
+void *tegra_drm_alloc(struct tegra_drm *tegra, size_t size, dma_addr_t *iova);
+void tegra_drm_free(struct tegra_drm *tegra, size_t size, void *virt,
+		    dma_addr_t iova);
+
 struct tegra_dc_soc_info;
 struct tegra_output;
 

From 8746f6593d42d67e7cb7ef1f66003738a7357cc8 Mon Sep 17 00:00:00 2001
From: Arto Merilainen <amerilainen@nvidia.com>
Date: Wed, 14 Dec 2016 13:16:12 +0200
Subject: [PATCH 12/17] drm/tegra: Add falcon helper library

Add a set of falcon helper routines for use by the tegradrm client drivers
of the various falcon-based engines.

The falcon is a microcontroller that acts as a frontend for the rest of a
particular Tegra engine.  In order to properly utilize these engines, the
frontend must be booted before pushing any commands.

Based on work by Andrew Chew <achew@nvidia.com>

Signed-off-by: Andrew Chew <achew@nvidia.com>
Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/Makefile |   3 +-
 drivers/gpu/drm/tegra/falcon.c | 259 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/tegra/falcon.h | 127 ++++++++++++++++
 3 files changed, 388 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/tegra/falcon.c
 create mode 100644 drivers/gpu/drm/tegra/falcon.h

diff --git a/drivers/gpu/drm/tegra/Makefile b/drivers/gpu/drm/tegra/Makefile
index 2c66a8db9da4..93e9a4a1e3d1 100644
--- a/drivers/gpu/drm/tegra/Makefile
+++ b/drivers/gpu/drm/tegra/Makefile
@@ -13,6 +13,7 @@ tegra-drm-y := \
 	sor.o \
 	dpaux.o \
 	gr2d.o \
-	gr3d.o
+	gr3d.o \
+	falcon.o
 
 obj-$(CONFIG_DRM_TEGRA) += tegra-drm.o
diff --git a/drivers/gpu/drm/tegra/falcon.c b/drivers/gpu/drm/tegra/falcon.c
new file mode 100644
index 000000000000..f685e72949d1
--- /dev/null
+++ b/drivers/gpu/drm/tegra/falcon.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2015, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/firmware.h>
+#include <linux/pci_ids.h>
+#include <linux/iopoll.h>
+
+#include "falcon.h"
+#include "drm.h"
+
+enum falcon_memory {
+	FALCON_MEMORY_IMEM,
+	FALCON_MEMORY_DATA,
+};
+
+static void falcon_writel(struct falcon *falcon, u32 value, u32 offset)
+{
+	writel(value, falcon->regs + offset);
+}
+
+int falcon_wait_idle(struct falcon *falcon)
+{
+	u32 value;
+
+	return readl_poll_timeout(falcon->regs + FALCON_IDLESTATE, value,
+				  (value == 0), 10, 100000);
+}
+
+static int falcon_dma_wait_idle(struct falcon *falcon)
+{
+	u32 value;
+
+	return readl_poll_timeout(falcon->regs + FALCON_DMATRFCMD, value,
+				  (value & FALCON_DMATRFCMD_IDLE), 10, 100000);
+}
+
+static int falcon_copy_chunk(struct falcon *falcon,
+			     phys_addr_t base,
+			     unsigned long offset,
+			     enum falcon_memory target)
+{
+	u32 cmd = FALCON_DMATRFCMD_SIZE_256B;
+
+	if (target == FALCON_MEMORY_IMEM)
+		cmd |= FALCON_DMATRFCMD_IMEM;
+
+	falcon_writel(falcon, offset, FALCON_DMATRFMOFFS);
+	falcon_writel(falcon, base, FALCON_DMATRFFBOFFS);
+	falcon_writel(falcon, cmd, FALCON_DMATRFCMD);
+
+	return falcon_dma_wait_idle(falcon);
+}
+
+static void falcon_copy_firmware_image(struct falcon *falcon,
+				       const struct firmware *firmware)
+{
+	u32 *firmware_vaddr = falcon->firmware.vaddr;
+	dma_addr_t daddr;
+	size_t i;
+	int err;
+
+	/* copy the whole thing taking into account endianness */
+	for (i = 0; i < firmware->size / sizeof(u32); i++)
+		firmware_vaddr[i] = le32_to_cpu(((u32 *)firmware->data)[i]);
+
+	/* ensure that caches are flushed and falcon can see the firmware */
+	daddr = dma_map_single(falcon->dev, firmware_vaddr,
+			       falcon->firmware.size, DMA_TO_DEVICE);
+	err = dma_mapping_error(falcon->dev, daddr);
+	if (err) {
+		dev_err(falcon->dev, "failed to map firmware: %d\n", err);
+		return;
+	}
+	dma_sync_single_for_device(falcon->dev, daddr,
+				   falcon->firmware.size, DMA_TO_DEVICE);
+	dma_unmap_single(falcon->dev, daddr, falcon->firmware.size,
+			 DMA_TO_DEVICE);
+}
+
+static int falcon_parse_firmware_image(struct falcon *falcon)
+{
+	struct falcon_fw_bin_header_v1 *bin = (void *)falcon->firmware.vaddr;
+	struct falcon_fw_os_header_v1 *os;
+
+	/* endian problems would show up right here */
+	if (bin->magic != PCI_VENDOR_ID_NVIDIA) {
+		dev_err(falcon->dev, "incorrect firmware magic\n");
+		return -EINVAL;
+	}
+
+	/* currently only version 1 is supported */
+	if (bin->version != 1) {
+		dev_err(falcon->dev, "unsupported firmware version\n");
+		return -EINVAL;
+	}
+
+	/* check that the firmware size is consistent */
+	if (bin->size > falcon->firmware.size) {
+		dev_err(falcon->dev, "firmware image size inconsistency\n");
+		return -EINVAL;
+	}
+
+	os = falcon->firmware.vaddr + bin->os_header_offset;
+
+	falcon->firmware.bin_data.size = bin->os_size;
+	falcon->firmware.bin_data.offset = bin->os_data_offset;
+	falcon->firmware.code.offset = os->code_offset;
+	falcon->firmware.code.size = os->code_size;
+	falcon->firmware.data.offset = os->data_offset;
+	falcon->firmware.data.size = os->data_size;
+
+	return 0;
+}
+
+int falcon_read_firmware(struct falcon *falcon, const char *name)
+{
+	int err;
+
+	/* request_firmware prints error if it fails */
+	err = request_firmware(&falcon->firmware.firmware, name, falcon->dev);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+int falcon_load_firmware(struct falcon *falcon)
+{
+	const struct firmware *firmware = falcon->firmware.firmware;
+	int err;
+
+	falcon->firmware.size = firmware->size;
+
+	/* allocate iova space for the firmware */
+	falcon->firmware.vaddr = falcon->ops->alloc(falcon, firmware->size,
+						    &falcon->firmware.paddr);
+	if (!falcon->firmware.vaddr) {
+		dev_err(falcon->dev, "dma memory mapping failed\n");
+		return -ENOMEM;
+	}
+
+	/* copy firmware image into local area. this also ensures endianness */
+	falcon_copy_firmware_image(falcon, firmware);
+
+	/* parse the image data */
+	err = falcon_parse_firmware_image(falcon);
+	if (err < 0) {
+		dev_err(falcon->dev, "failed to parse firmware image\n");
+		goto err_setup_firmware_image;
+	}
+
+	release_firmware(firmware);
+	falcon->firmware.firmware = NULL;
+
+	return 0;
+
+err_setup_firmware_image:
+	falcon->ops->free(falcon, falcon->firmware.size,
+			  falcon->firmware.paddr, falcon->firmware.vaddr);
+
+	return err;
+}
+
+int falcon_init(struct falcon *falcon)
+{
+	/* check mandatory ops */
+	if (!falcon->ops || !falcon->ops->alloc || !falcon->ops->free)
+		return -EINVAL;
+
+	falcon->firmware.vaddr = NULL;
+
+	return 0;
+}
+
+void falcon_exit(struct falcon *falcon)
+{
+	if (falcon->firmware.firmware) {
+		release_firmware(falcon->firmware.firmware);
+		falcon->firmware.firmware = NULL;
+	}
+
+	if (falcon->firmware.vaddr) {
+		falcon->ops->free(falcon, falcon->firmware.size,
+				  falcon->firmware.paddr,
+				  falcon->firmware.vaddr);
+		falcon->firmware.vaddr = NULL;
+	}
+}
+
+int falcon_boot(struct falcon *falcon)
+{
+	unsigned long offset;
+	int err;
+
+	if (!falcon->firmware.vaddr)
+		return -EINVAL;
+
+	falcon_writel(falcon, 0, FALCON_DMACTL);
+
+	/* setup the address of the binary data so Falcon can access it later */
+	falcon_writel(falcon, (falcon->firmware.paddr +
+			       falcon->firmware.bin_data.offset) >> 8,
+		      FALCON_DMATRFBASE);
+
+	/* copy the data segment into Falcon internal memory */
+	for (offset = 0; offset < falcon->firmware.data.size; offset += 256)
+		falcon_copy_chunk(falcon,
+				  falcon->firmware.data.offset + offset,
+				  offset, FALCON_MEMORY_DATA);
+
+	/* copy the first code segment into Falcon internal memory */
+	falcon_copy_chunk(falcon, falcon->firmware.code.offset,
+			  0, FALCON_MEMORY_IMEM);
+
+	/* setup falcon interrupts */
+	falcon_writel(falcon, FALCON_IRQMSET_EXT(0xff) |
+			      FALCON_IRQMSET_SWGEN1 |
+			      FALCON_IRQMSET_SWGEN0 |
+			      FALCON_IRQMSET_EXTERR |
+			      FALCON_IRQMSET_HALT |
+			      FALCON_IRQMSET_WDTMR,
+		      FALCON_IRQMSET);
+	falcon_writel(falcon, FALCON_IRQDEST_EXT(0xff) |
+			      FALCON_IRQDEST_SWGEN1 |
+			      FALCON_IRQDEST_SWGEN0 |
+			      FALCON_IRQDEST_EXTERR |
+			      FALCON_IRQDEST_HALT,
+		      FALCON_IRQDEST);
+
+	/* enable interface */
+	falcon_writel(falcon, FALCON_ITFEN_MTHDEN |
+			      FALCON_ITFEN_CTXEN,
+		      FALCON_ITFEN);
+
+	/* boot falcon */
+	falcon_writel(falcon, 0x00000000, FALCON_BOOTVEC);
+	falcon_writel(falcon, FALCON_CPUCTL_STARTCPU, FALCON_CPUCTL);
+
+	err = falcon_wait_idle(falcon);
+	if (err < 0) {
+		dev_err(falcon->dev, "Falcon boot failed due to timeout\n");
+		return err;
+	}
+
+	return 0;
+}
+
+void falcon_execute_method(struct falcon *falcon, u32 method, u32 data)
+{
+	falcon_writel(falcon, method >> 2, FALCON_UCLASS_METHOD_OFFSET);
+	falcon_writel(falcon, data, FALCON_UCLASS_METHOD_DATA);
+}
diff --git a/drivers/gpu/drm/tegra/falcon.h b/drivers/gpu/drm/tegra/falcon.h
new file mode 100644
index 000000000000..4504ed5a199e
--- /dev/null
+++ b/drivers/gpu/drm/tegra/falcon.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2015, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _FALCON_H_
+#define _FALCON_H_
+
+#include <linux/types.h>
+
+#define FALCON_UCLASS_METHOD_OFFSET		0x00000040
+
+#define FALCON_UCLASS_METHOD_DATA		0x00000044
+
+#define FALCON_IRQMSET				0x00001010
+#define FALCON_IRQMSET_WDTMR			(1 << 1)
+#define FALCON_IRQMSET_HALT			(1 << 4)
+#define FALCON_IRQMSET_EXTERR			(1 << 5)
+#define FALCON_IRQMSET_SWGEN0			(1 << 6)
+#define FALCON_IRQMSET_SWGEN1			(1 << 7)
+#define FALCON_IRQMSET_EXT(v)			(((v) & 0xff) << 8)
+
+#define FALCON_IRQDEST				0x0000101c
+#define FALCON_IRQDEST_HALT			(1 << 4)
+#define FALCON_IRQDEST_EXTERR			(1 << 5)
+#define FALCON_IRQDEST_SWGEN0			(1 << 6)
+#define FALCON_IRQDEST_SWGEN1			(1 << 7)
+#define FALCON_IRQDEST_EXT(v)			(((v) & 0xff) << 8)
+
+#define FALCON_ITFEN				0x00001048
+#define FALCON_ITFEN_CTXEN			(1 << 0)
+#define FALCON_ITFEN_MTHDEN			(1 << 1)
+
+#define FALCON_IDLESTATE			0x0000104c
+
+#define FALCON_CPUCTL				0x00001100
+#define FALCON_CPUCTL_STARTCPU			(1 << 1)
+
+#define FALCON_BOOTVEC				0x00001104
+
+#define FALCON_DMACTL				0x0000110c
+#define FALCON_DMACTL_DMEM_SCRUBBING		(1 << 1)
+#define FALCON_DMACTL_IMEM_SCRUBBING		(1 << 2)
+
+#define FALCON_DMATRFBASE			0x00001110
+
+#define FALCON_DMATRFMOFFS			0x00001114
+
+#define FALCON_DMATRFCMD			0x00001118
+#define FALCON_DMATRFCMD_IDLE			(1 << 1)
+#define FALCON_DMATRFCMD_IMEM			(1 << 4)
+#define FALCON_DMATRFCMD_SIZE_256B		(6 << 8)
+
+#define FALCON_DMATRFFBOFFS			0x0000111c
+
+struct falcon_fw_bin_header_v1 {
+	u32 magic;		/* 0x10de */
+	u32 version;		/* version of bin format (1) */
+	u32 size;		/* entire image size including this header */
+	u32 os_header_offset;
+	u32 os_data_offset;
+	u32 os_size;
+};
+
+struct falcon_fw_os_app_v1 {
+	u32 offset;
+	u32 size;
+};
+
+struct falcon_fw_os_header_v1 {
+	u32 code_offset;
+	u32 code_size;
+	u32 data_offset;
+	u32 data_size;
+};
+
+struct falcon;
+
+struct falcon_ops {
+	void *(*alloc)(struct falcon *falcon, size_t size,
+		       dma_addr_t *paddr);
+	void (*free)(struct falcon *falcon, size_t size,
+		     dma_addr_t paddr, void *vaddr);
+};
+
+struct falcon_firmware_section {
+	unsigned long offset;
+	size_t size;
+};
+
+struct falcon_firmware {
+	/* Firmware after it is read but not loaded */
+	const struct firmware *firmware;
+
+	/* Raw firmware data */
+	dma_addr_t paddr;
+	void *vaddr;
+	size_t size;
+
+	/* Parsed firmware information */
+	struct falcon_firmware_section bin_data;
+	struct falcon_firmware_section data;
+	struct falcon_firmware_section code;
+};
+
+struct falcon {
+	/* Set by falcon client */
+	struct device *dev;
+	void __iomem *regs;
+	const struct falcon_ops *ops;
+	void *data;
+
+	struct falcon_firmware firmware;
+};
+
+int falcon_init(struct falcon *falcon);
+void falcon_exit(struct falcon *falcon);
+int falcon_read_firmware(struct falcon *falcon, const char *firmware_name);
+int falcon_load_firmware(struct falcon *falcon);
+int falcon_boot(struct falcon *falcon);
+void falcon_execute_method(struct falcon *falcon, u32 method, u32 data);
+int falcon_wait_idle(struct falcon *falcon);
+
+#endif /* _FALCON_H_ */

From 6cc5f00f34c4c1a4891f976b71c0fb1c54a820ab Mon Sep 17 00:00:00 2001
From: Mikko Perttunen <mperttunen@nvidia.com>
Date: Wed, 14 Dec 2016 13:16:15 +0200
Subject: [PATCH 13/17] dt-bindings: Add bindings for the Tegra VIC

The VIC (Video Image Compositor) is a Host1x client unit
that can do various 2D composition and transform operations.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 .../display/tegra/nvidia,tegra20-host1x.txt         | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt
index 0fad7ed2ea19..74e1e8add5a1 100644
--- a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt
@@ -249,6 +249,19 @@ of the following host1x client modules:
   See ../pinctrl/nvidia,tegra124-dpaux-padctl.txt for information
   regarding the DPAUX pad controller bindings.
 
+- vic: Video Image Compositor
+  - compatible : "nvidia,tegra<chip>-vic"
+  - reg: Physical base address and length of the controller's registers.
+  - interrupts: The interrupt outputs from the controller.
+  - clocks: Must contain an entry for each entry in clock-names.
+    See ../clocks/clock-bindings.txt for details.
+  - clock-names: Must include the following entries:
+    - vic: clock input for the VIC hardware
+  - resets: Must contain an entry for each entry in reset-names.
+    See ../reset/reset.txt for details.
+  - reset-names: Must include the following entries:
+    - vic
+
 Example:
 
 / {

From 0ae797a8ba05a2354db5e81c1d7df04671dd1c25 Mon Sep 17 00:00:00 2001
From: Arto Merilainen <amerilainen@nvidia.com>
Date: Wed, 14 Dec 2016 13:16:13 +0200
Subject: [PATCH 14/17] drm/tegra: Add VIC support

This patch adds support for Video Image Compositor engine which
can be used for 2d operations.

Signed-off-by: Andrew Chew <achew@nvidia.com>
Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/Makefile |   3 +-
 drivers/gpu/drm/tegra/drm.c    |   3 +
 drivers/gpu/drm/tegra/drm.h    |   1 +
 drivers/gpu/drm/tegra/vic.c    | 396 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/tegra/vic.h    |  31 +++
 include/linux/host1x.h         |   1 +
 6 files changed, 434 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/tegra/vic.c
 create mode 100644 drivers/gpu/drm/tegra/vic.h

diff --git a/drivers/gpu/drm/tegra/Makefile b/drivers/gpu/drm/tegra/Makefile
index 93e9a4a1e3d1..6af3a9ad6565 100644
--- a/drivers/gpu/drm/tegra/Makefile
+++ b/drivers/gpu/drm/tegra/Makefile
@@ -14,6 +14,7 @@ tegra-drm-y := \
 	dpaux.o \
 	gr2d.o \
 	gr3d.o \
-	falcon.o
+	falcon.o \
+	vic.o
 
 obj-$(CONFIG_DRM_TEGRA) += tegra-drm.o
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index d9211cb78da5..bbce47dc79e6 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -1211,11 +1211,13 @@ static const struct of_device_id host1x_drm_subdevs[] = {
 	{ .compatible = "nvidia,tegra124-sor", },
 	{ .compatible = "nvidia,tegra124-hdmi", },
 	{ .compatible = "nvidia,tegra124-dsi", },
+	{ .compatible = "nvidia,tegra124-vic", },
 	{ .compatible = "nvidia,tegra132-dsi", },
 	{ .compatible = "nvidia,tegra210-dc", },
 	{ .compatible = "nvidia,tegra210-dsi", },
 	{ .compatible = "nvidia,tegra210-sor", },
 	{ .compatible = "nvidia,tegra210-sor1", },
+	{ .compatible = "nvidia,tegra210-vic", },
 	{ /* sentinel */ }
 };
 
@@ -1237,6 +1239,7 @@ static struct platform_driver * const drivers[] = {
 	&tegra_sor_driver,
 	&tegra_gr2d_driver,
 	&tegra_gr3d_driver,
+	&tegra_vic_driver,
 };
 
 static int __init host1x_drm_init(void)
diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h
index 668ccfb5cb05..9d6f4ffcc786 100644
--- a/drivers/gpu/drm/tegra/drm.h
+++ b/drivers/gpu/drm/tegra/drm.h
@@ -298,5 +298,6 @@ extern struct platform_driver tegra_dpaux_driver;
 extern struct platform_driver tegra_sor_driver;
 extern struct platform_driver tegra_gr2d_driver;
 extern struct platform_driver tegra_gr3d_driver;
+extern struct platform_driver tegra_vic_driver;
 
 #endif /* HOST1X_DRM_H */
diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c
new file mode 100644
index 000000000000..cd804e404a11
--- /dev/null
+++ b/drivers/gpu/drm/tegra/vic.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2015, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/host1x.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/reset.h>
+
+#include <soc/tegra/pmc.h>
+
+#include "drm.h"
+#include "falcon.h"
+#include "vic.h"
+
+struct vic_config {
+	const char *firmware;
+};
+
+struct vic {
+	struct falcon falcon;
+	bool booted;
+
+	void __iomem *regs;
+	struct tegra_drm_client client;
+	struct host1x_channel *channel;
+	struct iommu_domain *domain;
+	struct device *dev;
+	struct clk *clk;
+
+	/* Platform configuration */
+	const struct vic_config *config;
+};
+
+static inline struct vic *to_vic(struct tegra_drm_client *client)
+{
+	return container_of(client, struct vic, client);
+}
+
+static void vic_writel(struct vic *vic, u32 value, unsigned int offset)
+{
+	writel(value, vic->regs + offset);
+}
+
+static int vic_runtime_resume(struct device *dev)
+{
+	struct vic *vic = dev_get_drvdata(dev);
+
+	return clk_prepare_enable(vic->clk);
+}
+
+static int vic_runtime_suspend(struct device *dev)
+{
+	struct vic *vic = dev_get_drvdata(dev);
+
+	clk_disable_unprepare(vic->clk);
+
+	vic->booted = false;
+
+	return 0;
+}
+
+static int vic_boot(struct vic *vic)
+{
+	u32 fce_ucode_size, fce_bin_data_offset;
+	void *hdr;
+	int err = 0;
+
+	if (vic->booted)
+		return 0;
+
+	/* setup clockgating registers */
+	vic_writel(vic, CG_IDLE_CG_DLY_CNT(4) |
+			CG_IDLE_CG_EN |
+			CG_WAKEUP_DLY_CNT(4),
+		   NV_PVIC_MISC_PRI_VIC_CG);
+
+	err = falcon_boot(&vic->falcon);
+	if (err < 0)
+		return err;
+
+	hdr = vic->falcon.firmware.vaddr;
+	fce_bin_data_offset = *(u32 *)(hdr + VIC_UCODE_FCE_DATA_OFFSET);
+	hdr = vic->falcon.firmware.vaddr +
+		*(u32 *)(hdr + VIC_UCODE_FCE_HEADER_OFFSET);
+	fce_ucode_size = *(u32 *)(hdr + FCE_UCODE_SIZE_OFFSET);
+
+	falcon_execute_method(&vic->falcon, VIC_SET_APPLICATION_ID, 1);
+	falcon_execute_method(&vic->falcon, VIC_SET_FCE_UCODE_SIZE,
+			      fce_ucode_size);
+	falcon_execute_method(&vic->falcon, VIC_SET_FCE_UCODE_OFFSET,
+			      (vic->falcon.firmware.paddr + fce_bin_data_offset)
+				>> 8);
+
+	err = falcon_wait_idle(&vic->falcon);
+	if (err < 0) {
+		dev_err(vic->dev,
+			"failed to set application ID and FCE base\n");
+		return err;
+	}
+
+	vic->booted = true;
+
+	return 0;
+}
+
+static void *vic_falcon_alloc(struct falcon *falcon, size_t size,
+			       dma_addr_t *iova)
+{
+	struct tegra_drm *tegra = falcon->data;
+
+	return tegra_drm_alloc(tegra, size, iova);
+}
+
+static void vic_falcon_free(struct falcon *falcon, size_t size,
+			    dma_addr_t iova, void *va)
+{
+	struct tegra_drm *tegra = falcon->data;
+
+	return tegra_drm_free(tegra, size, va, iova);
+}
+
+static const struct falcon_ops vic_falcon_ops = {
+	.alloc = vic_falcon_alloc,
+	.free = vic_falcon_free
+};
+
+static int vic_init(struct host1x_client *client)
+{
+	struct tegra_drm_client *drm = host1x_to_drm_client(client);
+	struct drm_device *dev = dev_get_drvdata(client->parent);
+	struct tegra_drm *tegra = dev->dev_private;
+	struct vic *vic = to_vic(drm);
+	int err;
+
+	if (tegra->domain) {
+		err = iommu_attach_device(tegra->domain, vic->dev);
+		if (err < 0) {
+			dev_err(vic->dev, "failed to attach to domain: %d\n",
+				err);
+			return err;
+		}
+
+		vic->domain = tegra->domain;
+	}
+
+	if (!vic->falcon.data) {
+		vic->falcon.data = tegra;
+		err = falcon_load_firmware(&vic->falcon);
+		if (err < 0)
+			goto detach_device;
+	}
+
+	vic->channel = host1x_channel_request(client->dev);
+	if (!vic->channel) {
+		err = -ENOMEM;
+		goto detach_device;
+	}
+
+	client->syncpts[0] = host1x_syncpt_request(client->dev, 0);
+	if (!client->syncpts[0]) {
+		err = -ENOMEM;
+		goto free_channel;
+	}
+
+	err = tegra_drm_register_client(tegra, drm);
+	if (err < 0)
+		goto free_syncpt;
+
+	return 0;
+
+free_syncpt:
+	host1x_syncpt_free(client->syncpts[0]);
+free_channel:
+	host1x_channel_free(vic->channel);
+detach_device:
+	if (tegra->domain)
+		iommu_detach_device(tegra->domain, vic->dev);
+
+	return err;
+}
+
+static int vic_exit(struct host1x_client *client)
+{
+	struct tegra_drm_client *drm = host1x_to_drm_client(client);
+	struct drm_device *dev = dev_get_drvdata(client->parent);
+	struct tegra_drm *tegra = dev->dev_private;
+	struct vic *vic = to_vic(drm);
+	int err;
+
+	err = tegra_drm_unregister_client(tegra, drm);
+	if (err < 0)
+		return err;
+
+	host1x_syncpt_free(client->syncpts[0]);
+	host1x_channel_free(vic->channel);
+
+	if (vic->domain) {
+		iommu_detach_device(vic->domain, vic->dev);
+		vic->domain = NULL;
+	}
+
+	return 0;
+}
+
+static const struct host1x_client_ops vic_client_ops = {
+	.init = vic_init,
+	.exit = vic_exit,
+};
+
+static int vic_open_channel(struct tegra_drm_client *client,
+			    struct tegra_drm_context *context)
+{
+	struct vic *vic = to_vic(client);
+	int err;
+
+	err = pm_runtime_get_sync(vic->dev);
+	if (err < 0)
+		return err;
+
+	err = vic_boot(vic);
+	if (err < 0) {
+		pm_runtime_put(vic->dev);
+		return err;
+	}
+
+	context->channel = host1x_channel_get(vic->channel);
+	if (!context->channel) {
+		pm_runtime_put(vic->dev);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void vic_close_channel(struct tegra_drm_context *context)
+{
+	struct vic *vic = to_vic(context->client);
+
+	host1x_channel_put(context->channel);
+
+	pm_runtime_put(vic->dev);
+}
+
+static const struct tegra_drm_client_ops vic_ops = {
+	.open_channel = vic_open_channel,
+	.close_channel = vic_close_channel,
+	.submit = tegra_drm_submit,
+};
+
+static const struct vic_config vic_t124_config = {
+	.firmware = "nvidia/tegra124/vic03_ucode.bin",
+};
+
+static const struct vic_config vic_t210_config = {
+	.firmware = "nvidia/tegra210/vic04_ucode.bin",
+};
+
+static const struct of_device_id vic_match[] = {
+	{ .compatible = "nvidia,tegra124-vic", .data = &vic_t124_config },
+	{ .compatible = "nvidia,tegra210-vic", .data = &vic_t210_config },
+	{ },
+};
+
+static int vic_probe(struct platform_device *pdev)
+{
+	struct vic_config *vic_config = NULL;
+	struct device *dev = &pdev->dev;
+	struct host1x_syncpt **syncpts;
+	struct resource *regs;
+	const struct of_device_id *match;
+	struct vic *vic;
+	int err;
+
+	match = of_match_device(vic_match, dev);
+	vic_config = (struct vic_config *)match->data;
+
+	vic = devm_kzalloc(dev, sizeof(*vic), GFP_KERNEL);
+	if (!vic)
+		return -ENOMEM;
+
+	syncpts = devm_kzalloc(dev, sizeof(*syncpts), GFP_KERNEL);
+	if (!syncpts)
+		return -ENOMEM;
+
+	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!regs) {
+		dev_err(&pdev->dev, "failed to get registers\n");
+		return -ENXIO;
+	}
+
+	vic->regs = devm_ioremap_resource(dev, regs);
+	if (IS_ERR(vic->regs))
+		return PTR_ERR(vic->regs);
+
+	vic->clk = devm_clk_get(dev, NULL);
+	if (IS_ERR(vic->clk)) {
+		dev_err(&pdev->dev, "failed to get clock\n");
+		return PTR_ERR(vic->clk);
+	}
+
+	vic->falcon.dev = dev;
+	vic->falcon.regs = vic->regs;
+	vic->falcon.ops = &vic_falcon_ops;
+
+	err = falcon_init(&vic->falcon);
+	if (err < 0)
+		return err;
+
+	err = falcon_read_firmware(&vic->falcon, vic_config->firmware);
+	if (err < 0)
+		goto exit_falcon;
+
+	platform_set_drvdata(pdev, vic);
+
+	INIT_LIST_HEAD(&vic->client.base.list);
+	vic->client.base.ops = &vic_client_ops;
+	vic->client.base.dev = dev;
+	vic->client.base.class = HOST1X_CLASS_VIC;
+	vic->client.base.syncpts = syncpts;
+	vic->client.base.num_syncpts = 1;
+	vic->dev = dev;
+	vic->config = vic_config;
+
+	INIT_LIST_HEAD(&vic->client.list);
+	vic->client.ops = &vic_ops;
+
+	err = host1x_client_register(&vic->client.base);
+	if (err < 0) {
+		dev_err(dev, "failed to register host1x client: %d\n", err);
+		platform_set_drvdata(pdev, NULL);
+		goto exit_falcon;
+	}
+
+	pm_runtime_enable(&pdev->dev);
+	if (!pm_runtime_enabled(&pdev->dev)) {
+		err = vic_runtime_resume(&pdev->dev);
+		if (err < 0)
+			goto unregister_client;
+	}
+
+	return 0;
+
+unregister_client:
+	host1x_client_unregister(&vic->client.base);
+exit_falcon:
+	falcon_exit(&vic->falcon);
+
+	return err;
+}
+
+static int vic_remove(struct platform_device *pdev)
+{
+	struct vic *vic = platform_get_drvdata(pdev);
+	int err;
+
+	err = host1x_client_unregister(&vic->client.base);
+	if (err < 0) {
+		dev_err(&pdev->dev, "failed to unregister host1x client: %d\n",
+			err);
+		return err;
+	}
+
+	if (pm_runtime_enabled(&pdev->dev))
+		pm_runtime_disable(&pdev->dev);
+	else
+		vic_runtime_suspend(&pdev->dev);
+
+	falcon_exit(&vic->falcon);
+
+	return 0;
+}
+
+static const struct dev_pm_ops vic_pm_ops = {
+	SET_RUNTIME_PM_OPS(vic_runtime_suspend, vic_runtime_resume, NULL)
+};
+
+struct platform_driver tegra_vic_driver = {
+	.driver = {
+		.name = "tegra-vic",
+		.of_match_table = vic_match,
+		.pm = &vic_pm_ops
+	},
+	.probe = vic_probe,
+	.remove = vic_remove,
+};
diff --git a/drivers/gpu/drm/tegra/vic.h b/drivers/gpu/drm/tegra/vic.h
new file mode 100644
index 000000000000..21844817a7e1
--- /dev/null
+++ b/drivers/gpu/drm/tegra/vic.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2015, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef TEGRA_VIC_H
+#define TEGRA_VIC_H
+
+/* VIC methods */
+
+#define VIC_SET_APPLICATION_ID			0x00000200
+#define VIC_SET_FCE_UCODE_SIZE			0x0000071C
+#define VIC_SET_FCE_UCODE_OFFSET		0x0000072C
+
+/* VIC registers */
+
+#define NV_PVIC_MISC_PRI_VIC_CG			0x000016d0
+#define CG_IDLE_CG_DLY_CNT(val)			((val & 0x3f) << 0)
+#define CG_IDLE_CG_EN				(1 << 6)
+#define CG_WAKEUP_DLY_CNT(val)			((val & 0xf) << 16)
+
+/* Firmware offsets */
+
+#define VIC_UCODE_FCE_HEADER_OFFSET		(6*4)
+#define VIC_UCODE_FCE_DATA_OFFSET		(7*4)
+#define FCE_UCODE_SIZE_OFFSET			(2*4)
+
+#endif /* TEGRA_VIC_H */
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 1ffbf2a8cb99..3d04aa1dc83e 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -26,6 +26,7 @@ enum host1x_class {
 	HOST1X_CLASS_HOST1X = 0x1,
 	HOST1X_CLASS_GR2D = 0x51,
 	HOST1X_CLASS_GR2D_SB = 0x52,
+	HOST1X_CLASS_VIC = 0x5D,
 	HOST1X_CLASS_GR3D = 0x60,
 };
 

From 7e7d432c5a736e7106c8700b65d8c31b93bd1c82 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Tue, 21 Mar 2017 08:54:21 +0100
Subject: [PATCH 15/17] gpu: host1x: Sort includes alphabetically

Sorting includes alphabetically makes it easier and less conflict-prone
to add new includes subsequently.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/dev.c | 17 +++++++++--------
 drivers/gpu/host1x/dev.h |  8 ++++----
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index b386a0bf828a..8a0d97243c9c 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -16,24 +16,25 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <linux/module.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/of.h>
-#include <linux/of_device.h>
 #include <linux/clk.h>
-#include <linux/io.h>
 #include <linux/dma-mapping.h>
+#include <linux/io.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/of.h>
+#include <linux/slab.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/host1x.h>
 #undef CREATE_TRACE_POINTS
 
 #include "bus.h"
-#include "dev.h"
-#include "intr.h"
 #include "channel.h"
 #include "debug.h"
+#include "dev.h"
+#include "intr.h"
+
 #include "hw/host1x01.h"
 #include "hw/host1x02.h"
 #include "hw/host1x04.h"
diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
index e5113acecd7a..561c5776cafb 100644
--- a/drivers/gpu/host1x/dev.h
+++ b/drivers/gpu/host1x/dev.h
@@ -17,16 +17,16 @@
 #ifndef HOST1X_DEV_H
 #define HOST1X_DEV_H
 
-#include <linux/platform_device.h>
 #include <linux/device.h>
 #include <linux/iommu.h>
 #include <linux/iova.h>
+#include <linux/platform_device.h>
 
-#include "channel.h"
-#include "syncpt.h"
-#include "intr.h"
 #include "cdma.h"
+#include "channel.h"
+#include "intr.h"
 #include "job.h"
+#include "syncpt.h"
 
 struct host1x_syncpt;
 struct host1x_syncpt_base;

From b386c6b73ac6c2a9a2f1201d055ab65cc19890a2 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Tue, 21 Mar 2017 08:54:22 +0100
Subject: [PATCH 16/17] gpu: host1x: Support module reset

Newer versions of Tegra come with early boot software that aggressively
puts various modules in reset. Add support to the host1x driver to take
the module out of reset on probe, and assert reset on removal.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/dev.c | 18 +++++++++++++++++-
 drivers/gpu/host1x/dev.h |  2 ++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index 8a0d97243c9c..f05ebb14fa63 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -170,6 +170,13 @@ static int host1x_probe(struct platform_device *pdev)
 		return err;
 	}
 
+	host->rst = devm_reset_control_get(&pdev->dev, "host1x");
+	if (IS_ERR(host->rst)) {
+		err = PTR_ERR(host->clk);
+		dev_err(&pdev->dev, "failed to get reset: %d\n", err);
+		return err;
+	}
+
 	if (iommu_present(&platform_bus_type)) {
 		struct iommu_domain_geometry *geometry;
 		unsigned long order;
@@ -203,10 +210,16 @@ static int host1x_probe(struct platform_device *pdev)
 		goto fail_detach_device;
 	}
 
+	err = reset_control_deassert(host->rst);
+	if (err < 0) {
+		dev_err(&pdev->dev, "failed to deassert reset: %d\n", err);
+		goto fail_unprepare_disable;
+	}
+
 	err = host1x_syncpt_init(host);
 	if (err) {
 		dev_err(&pdev->dev, "failed to initialize syncpts\n");
-		goto fail_unprepare_disable;
+		goto fail_reset_assert;
 	}
 
 	err = host1x_intr_init(host, syncpt_irq);
@@ -227,6 +240,8 @@ fail_deinit_intr:
 	host1x_intr_deinit(host);
 fail_deinit_syncpt:
 	host1x_syncpt_deinit(host);
+fail_reset_assert:
+	reset_control_assert(host->rst);
 fail_unprepare_disable:
 	clk_disable_unprepare(host->clk);
 fail_detach_device:
@@ -248,6 +263,7 @@ static int host1x_remove(struct platform_device *pdev)
 	host1x_unregister(host);
 	host1x_intr_deinit(host);
 	host1x_syncpt_deinit(host);
+	reset_control_assert(host->rst);
 	clk_disable_unprepare(host->clk);
 
 	if (host->domain) {
diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
index 561c5776cafb..229d08b6a45e 100644
--- a/drivers/gpu/host1x/dev.h
+++ b/drivers/gpu/host1x/dev.h
@@ -21,6 +21,7 @@
 #include <linux/iommu.h>
 #include <linux/iova.h>
 #include <linux/platform_device.h>
+#include <linux/reset.h>
 
 #include "cdma.h"
 #include "channel.h"
@@ -109,6 +110,7 @@ struct host1x {
 	struct host1x_syncpt_base *bases;
 	struct device *dev;
 	struct clk *clk;
+	struct reset_control *rst;
 
 	struct iommu_domain *domain;
 	struct iova_domain iova;

From b0d36daa0ab54714e05164f6e21d22f974a5eec1 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 22 Mar 2017 19:15:18 +0100
Subject: [PATCH 17/17] gpu: host1x: Fix host1x driver shutdown

Shutting down a host1x device currently crashes if the device has failed
to probe. The root cause is that the host1x shutdown is implemented as a
struct bus_type callback, but in turn relies on the driver bound to the
device. On failure to probe, no driver will be bound and cause the code
to crash.

Fix this by moving the ->probe(), ->remove() and ->shutdown() callbacks
to the driver rather than the bus.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/bus.c | 68 ++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index eeb021fe6410..561831e1ae2c 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -267,37 +267,6 @@ static int host1x_device_match(struct device *dev, struct device_driver *drv)
 	return strcmp(dev_name(dev), drv->name) == 0;
 }
 
-static int host1x_device_probe(struct device *dev)
-{
-	struct host1x_driver *driver = to_host1x_driver(dev->driver);
-	struct host1x_device *device = to_host1x_device(dev);
-
-	if (driver->probe)
-		return driver->probe(device);
-
-	return 0;
-}
-
-static int host1x_device_remove(struct device *dev)
-{
-	struct host1x_driver *driver = to_host1x_driver(dev->driver);
-	struct host1x_device *device = to_host1x_device(dev);
-
-	if (driver->remove)
-		return driver->remove(device);
-
-	return 0;
-}
-
-static void host1x_device_shutdown(struct device *dev)
-{
-	struct host1x_driver *driver = to_host1x_driver(dev->driver);
-	struct host1x_device *device = to_host1x_device(dev);
-
-	if (driver->shutdown)
-		driver->shutdown(device);
-}
-
 static const struct dev_pm_ops host1x_device_pm_ops = {
 	.suspend = pm_generic_suspend,
 	.resume = pm_generic_resume,
@@ -310,9 +279,6 @@ static const struct dev_pm_ops host1x_device_pm_ops = {
 struct bus_type host1x_bus_type = {
 	.name = "host1x",
 	.match = host1x_device_match,
-	.probe = host1x_device_probe,
-	.remove = host1x_device_remove,
-	.shutdown = host1x_device_shutdown,
 	.pm = &host1x_device_pm_ops,
 };
 
@@ -516,6 +482,37 @@ int host1x_unregister(struct host1x *host1x)
 	return 0;
 }
 
+static int host1x_device_probe(struct device *dev)
+{
+	struct host1x_driver *driver = to_host1x_driver(dev->driver);
+	struct host1x_device *device = to_host1x_device(dev);
+
+	if (driver->probe)
+		return driver->probe(device);
+
+	return 0;
+}
+
+static int host1x_device_remove(struct device *dev)
+{
+	struct host1x_driver *driver = to_host1x_driver(dev->driver);
+	struct host1x_device *device = to_host1x_device(dev);
+
+	if (driver->remove)
+		return driver->remove(device);
+
+	return 0;
+}
+
+static void host1x_device_shutdown(struct device *dev)
+{
+	struct host1x_driver *driver = to_host1x_driver(dev->driver);
+	struct host1x_device *device = to_host1x_device(dev);
+
+	if (driver->shutdown)
+		driver->shutdown(device);
+}
+
 int host1x_driver_register_full(struct host1x_driver *driver,
 				struct module *owner)
 {
@@ -536,6 +533,9 @@ int host1x_driver_register_full(struct host1x_driver *driver,
 
 	driver->driver.bus = &host1x_bus_type;
 	driver->driver.owner = owner;
+	driver->driver.probe = host1x_device_probe;
+	driver->driver.remove = host1x_device_remove;
+	driver->driver.shutdown = host1x_device_shutdown;
 
 	return driver_register(&driver->driver);
 }