author	Alexandre Courbot <acourbot@nvidia.com>	2014-10-30 18:21:25 +0900
committer	Alexandre Courbot <acourbot@nvidia.com>	2014-11-11 18:59:43 +0900
commit	b0cfd481fffce1302b4221fb0e56baff6fe2c994
tree	f47136794eae6f0b6254199ba1b379940016b866
parent	d337b253ef0562d4f02a06bfdbd44445e2938b7a
download	nouveau-gk20a_next.tar.gz

fb/ramgk20a: support IOMMU if present

Tegra SoCs come with an IOMMU that can make physically non-contiguous regions of memory appear contiguous to the GPU. This patch adds support for this unit if present. This greatly optimizes memory usage and allows GK20A to perform without CMA.

Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
-rw-r--r--drm/nv84_fence.c4
-rw-r--r--nvkm/subdev/fb/ramgk20a.c245
-rw-r--r--nvkm/subdev/vm/nvc0.c5
3 files changed, 240 insertions, 14 deletions
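
The IOMMU path added below follows the stock Linux IOMMU API: if the platform bus has an IOMMU, a domain is allocated and attached to the GPU, and each allocation is backed by individually allocated pages that are mapped at consecutive IOMMU addresses so the GPU sees one contiguous region. A condensed sketch of that mapping pattern (not the patch itself: the helper name is made up, error reporting is omitted, and a 4 KiB IOMMU mapping granularity is assumed, the same assumption the patch's own TODOs call out):

#include <linux/gfp.h>
#include <linux/iommu.h>
#include <linux/mm.h>

/*
 * Illustrative sketch only: back 'npages' pages and map them at
 * consecutive addresses starting at 'iova' in an already-attached
 * IOMMU domain, so that the GPU sees one contiguous region.
 */
static int gk20a_iommu_map_sketch(struct iommu_domain *domain,
				  unsigned long iova, u32 npages,
				  struct page **pages)
{
	u32 i;
	int err;

	for (i = 0; i < npages; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i]) {
			err = -ENOMEM;
			goto unwind;
		}

		/* 4 KiB mapping granularity assumed, as in the patch */
		err = iommu_map(domain, iova + i * PAGE_SIZE,
				page_to_phys(pages[i]), PAGE_SIZE,
				IOMMU_READ | IOMMU_WRITE);
		if (err) {
			__free_page(pages[i]);
			goto unwind;
		}
	}
	return 0;

unwind:
	while (i-- > 0) {
		iommu_unmap(domain, iova + i * PAGE_SIZE, PAGE_SIZE);
		__free_page(pages[i]);
	}
	return err;
}

gk20a_ram_get_iommu() below follows the same shape, with the IOMMU address range carved out of a nouveau_mm allocator rather than passed in by the caller.
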
diff --git a/drm/nv84_fence.c b/drm/nv84_fence.c
index 4d79be755..1adb246ab 100644
--- a/drm/nv84_fence.c
+++ b/drm/nv84_fence.c
@@ -232,9 +232,9 @@ nv84_fence_create(struct nouveau_drm *drm)
priv->base.uevent = true;
ret = nouveau_bo_new(drm->dev, 16 * priv->base.contexts, 0,
- TTM_PL_FLAG_VRAM, 0, 0, NULL, NULL, &priv->bo);
+ TTM_PL_FLAG_TT | TTM_PL_FLAG_UNCACHED, 0, 0, NULL, NULL, &priv->bo);
if (ret == 0) {
- ret = nouveau_bo_pin(priv->bo, TTM_PL_FLAG_VRAM);
+ ret = nouveau_bo_pin(priv->bo, TTM_PL_FLAG_TT);
if (ret == 0) {
ret = nouveau_bo_map(priv->bo);
if (ret)
diff --git a/nvkm/subdev/fb/ramgk20a.c b/nvkm/subdev/fb/ramgk20a.c
index ba87b8ce6..1df261ca6 100644
--- a/nvkm/subdev/fb/ramgk20a.c
+++ b/nvkm/subdev/fb/ramgk20a.c
@@ -24,15 +24,53 @@
#include <subdev/fb.h>
+#include <linux/iommu.h>
+#include <linux/platform_device.h>
+
+/*
+ * GK20A's fake VRAM can be allocated in different ways, depending on whether
+ * an IOMMU is available or not.
+ *
+ * If an IOMMU is not present, contiguous coherent memory is allocated using
+ * the DMA API and the physical address is used. This is slow and requires the
+ * use of CMA, so this is not recommended.
+ *
+ * If an IOMMU is present, physical memory is allocated with page granularity
+ * and mapped into the GPU IOMMU address space (not to be confused with the GPU
+ * address space!). This solution is much preferred as it does not rely on
+ * contiguous physical memory.
+ *
+ */
struct gk20a_mem {
struct nouveau_mem base;
- void *cpuaddr;
- dma_addr_t handle;
+ union {
+ /*
+ * For IOMMU-less systems: allocate using DMA API
+ */
+ struct {
+ void *cpuaddr;
+ dma_addr_t handle;
+ };
+ /*
+ * For systems with IOMMU: map individual pages
+ */
+ struct {
+ u32 npages;
+ struct page *pages[0];
+ };
+ };
};
#define to_gk20a_mem(m) container_of(m, struct gk20a_mem, base)
+struct gk20a_ram {
+ struct nouveau_ram base;
+
+ struct nouveau_mm mm;
+ struct iommu_domain *domain;
+};
+
static void
-gk20a_ram_put(struct nouveau_fb *pfb, struct nouveau_mem **pmem)
+gk20a_ram_put_dma(struct nouveau_fb *pfb, struct nouveau_mem **pmem)
{
struct device *dev = nv_device_base(nv_device(pfb));
struct gk20a_mem *mem = to_gk20a_mem(*pmem);
@@ -52,8 +90,8 @@ gk20a_ram_put(struct nouveau_fb *pfb, struct nouveau_mem **pmem)
}
static int
-gk20a_ram_get(struct nouveau_fb *pfb, u64 size, u32 align, u32 ncmin,
- u32 memtype, struct nouveau_mem **pmem)
+gk20a_ram_get_dma(struct nouveau_fb *pfb, u64 size, u32 align, u32 ncmin,
+ u32 memtype, struct nouveau_mem **pmem)
{
struct device *dev = nv_device_base(nv_device(pfb));
struct gk20a_mem *mem;
@@ -95,7 +133,7 @@ gk20a_ram_get(struct nouveau_fb *pfb, u64 size, u32 align, u32 ncmin,
&mem->handle, GFP_KERNEL);
if (!mem->cpuaddr) {
nv_error(pfb, "%s: cannot allocate memory!\n", __func__);
- gk20a_ram_put(pfb, pmem);
+ gk20a_ram_put_dma(pfb, pmem);
return -ENOMEM;
}
@@ -122,32 +160,215 @@ gk20a_ram_get(struct nouveau_fb *pfb, u64 size, u32 align, u32 ncmin,
return 0;
}
+static void
+gk20a_ram_put_iommu(struct nouveau_fb *pfb, struct nouveau_mem **pmem)
+{
+ struct gk20a_ram *ram = (struct gk20a_ram *)pfb->ram;
+ struct gk20a_mem *mem = to_gk20a_mem(*pmem);
+ struct nouveau_mm_node *r;
+ int i;
+
+ *pmem = NULL;
+ if (unlikely(mem == NULL))
+ return;
+
+ r = list_first_entry(&mem->base.regions, struct nouveau_mm_node, rl_entry);
+
+ /* Unmap pages from GPU address space and free them */
+ for (i = 0; i < mem->npages; i++) {
+ iommu_unmap(ram->domain, (r->offset + i) << 12, PAGE_SIZE);
+ __free_page(mem->pages[i]);
+ }
+
+ /* Release area from GPU address space */
+ nouveau_mm_free(&ram->mm, &r);
+
+ /* Free mem struct */
+ kfree(mem);
+}
+
+static int
+gk20a_ram_get_iommu(struct nouveau_fb *pfb, u64 size, u32 align, u32 ncmin,
+ u32 memtype, struct nouveau_mem **pmem)
+{
+ struct gk20a_ram *ram = (struct gk20a_ram *)pfb->ram;
+ struct gk20a_mem *mem;
+ u32 type = memtype & 0xff;
+ struct nouveau_mm_node *r;
+ u32 npages;
+ int err;
+ int i;
+
+ nv_debug(pfb, "%s: size: %llx align: %x, ncmin: %x\n", __func__, size,
+ align, ncmin);
+
+ npages = size >> PAGE_SHIFT;
+ if (npages == 0)
+ npages = 1;
+
+ if (align == 0)
+ /* TODO should be size of page for IOMMU! */
+ align = PAGE_SIZE;
+
+ /* Create mem structure */
+ mem = kzalloc(sizeof(*mem) + npages * sizeof(mem->pages[0]),
+ GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ /* Allocate backing memory */
+ mem->npages = npages;
+ for (i = 0; i < npages; i++) {
+ struct page *p = alloc_page(GFP_KERNEL);
+
+ if (p == NULL) {
+ err = -ENOMEM;
+ goto free_pages;
+ }
+ mem->pages[i] = p;
+ }
+
+ /* Reserve area of GPU address space */
+ /* TODO same here */
+ err = nouveau_mm_head(&ram->mm, 0, 1, npages, npages, align >> 12, &r);
+ if (err) {
+ nv_error(pfb, "virtual space is full!\n");
+ goto free_pages;
+ }
+
+ /* Map into GPU address space */
+ for (i = 0; i < npages; i++) {
+ struct page *p = mem->pages[i];
+ u32 offset = (r->offset + i) << 12;
+
+ /* TODO and here */
+ err = iommu_map(ram->domain, offset, page_to_phys(p),
+ PAGE_SIZE, IOMMU_READ | IOMMU_WRITE);
+ if (err < 0) {
+ nv_error(pfb, "cannot map memory into IOMMU: %d\n", err);
+
+ while (i-- > 0) {
+ offset -= PAGE_SIZE;
+ iommu_unmap(ram->domain, offset, PAGE_SIZE);
+ }
+ goto release_area;
+ }
+ /* TODO won't work if PAGE_SHIFT != 12 */
+ offset += PAGE_SIZE;
+ }
+
+ mem->base.size = npages;
+ mem->base.memtype = type;
+ /* Bit 34 indicates that an address is to be resolved through the IOMMU */
+ mem->base.offset = (u64)(r->offset << 12) | BIT_ULL(34);
+
+ INIT_LIST_HEAD(&mem->base.regions);
+ list_add_tail(&r->rl_entry, &mem->base.regions);
+
+ *pmem = &mem->base;
+
+ goto end;
+
+release_area:
+ nouveau_mm_free(&ram->mm, &r);
+
+free_pages:
+ for (i = 0; i < npages && mem->pages[i] != NULL; i++)
+ __free_page(mem->pages[i]);
+
+ kfree(mem);
+
+end:
+ nv_debug(pfb, "alloc size: 0x%x, align: 0x%x, gaddr: 0x%llx\n",
+ npages << PAGE_SHIFT, align, mem->base.offset);
+
+ return err;
+}
+
static int
gk20a_ram_ctor(struct nouveau_object *parent, struct nouveau_object *engine,
struct nouveau_oclass *oclass, void *data, u32 datasize,
struct nouveau_object **pobject)
{
- struct nouveau_ram *ram;
+ struct gk20a_ram *ram;
+ struct device *dev = nv_device_base(nv_device(parent));
int ret;
ret = nouveau_ram_create(parent, engine, oclass, &ram);
*pobject = nv_object(ram);
if (ret)
return ret;
- ram->type = NV_MEM_TYPE_STOLEN;
- ram->size = get_num_physpages() << PAGE_SHIFT;
- ram->get = gk20a_ram_get;
- ram->put = gk20a_ram_put;
+ ram->base.type = NV_MEM_TYPE_STOLEN;
+ ram->base.size = get_num_physpages() << PAGE_SHIFT;
+ ram->base.get = gk20a_ram_get_dma;
+ ram->base.put = gk20a_ram_put_dma;
+
+ /* If IOMMU is present and set up correctly, use it */
+ if (iommu_present(&platform_bus_type)) {
+ ram->domain = iommu_domain_alloc(&platform_bus_type);
+ if (IS_ERR(ram->domain)) {
+ nv_error(parent, "cannot allocate IOMMU domain!\n");
+ nv_error(parent, "falling back to DMA API...\n");
+ goto end;
+ }
+
+ ret = iommu_attach_device(ram->domain, dev);
+ if (ret) {
+ nv_error(parent, "cannot attach device to IOMMU\n");
+ nv_error(parent, "falling back to DMA API...\n");
+ goto free_domain;
+ }
+
+ /* TODO use shift of IOMMU! */
+ ret = nouveau_mm_init(&ram->mm, 0, SZ_2G >> 12, 1);
+ if (ret) {
+ nv_error(ram, "cannot initialize MM\n");
+ nv_error(parent, "falling back to DMA API...\n");
+ goto detach_device;
+ }
+
+ /* All set, we can use the IOMMU to handle VRAM! */
+ ram->base.get = gk20a_ram_get_iommu;
+ ram->base.put = gk20a_ram_put_iommu;
+
+ nv_info(parent, "using IOMMU\n");
+ } else {
+ nv_info(parent, "using DMA API\n");
+ }
return 0;
+
+detach_device:
+ iommu_detach_device(ram->domain, dev);
+free_domain:
+ iommu_domain_free(ram->domain);
+end:
+ return 0;
+}
+
+static void
+gk20a_ram_dtor(struct nouveau_object *obj)
+{
+ struct gk20a_ram *ram = (void *)obj;
+ struct device *dev = nv_device_base(nv_device(ram));
+ int ret;
+
+ if (ram->domain) {
+ ret = nouveau_mm_fini(&ram->mm);
+ if (ret)
+ nv_warn(obj, "cannot clear MM: %d\n", ret);
+
+ iommu_detach_device(ram->domain, dev);
+ iommu_domain_free(ram->domain);
+ }
}
struct nouveau_oclass
gk20a_ram_oclass = {
.ofuncs = &(struct nouveau_ofuncs) {
.ctor = gk20a_ram_ctor,
- .dtor = _nouveau_ram_dtor,
+ .dtor = gk20a_ram_dtor,
.init = _nouveau_ram_init,
.fini = _nouveau_ram_fini,
},
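
The constructor above only installs the IOMMU-backed get/put callbacks once the presence check, domain allocation, device attach and address-space setup have all succeeded; any failure falls back to the DMA API. A condensed sketch of that probe-and-fallback logic (simplified, with a hypothetical helper name; the nouveau_mm setup and error messages are left out):

#include <linux/iommu.h>
#include <linux/platform_device.h>

/*
 * Illustrative sketch only: return an attached IOMMU domain for 'dev',
 * or NULL if the platform has no usable IOMMU, in which case the caller
 * keeps using the DMA API allocation path.
 */
static struct iommu_domain *gk20a_probe_iommu_sketch(struct device *dev)
{
	struct iommu_domain *domain;

	if (!iommu_present(&platform_bus_type))
		return NULL;

	domain = iommu_domain_alloc(&platform_bus_type);
	if (!domain)
		return NULL;

	if (iommu_attach_device(domain, dev)) {
		iommu_domain_free(domain);
		return NULL;
	}

	return domain;
}
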
diff --git a/nvkm/subdev/vm/nvc0.c b/nvkm/subdev/vm/nvc0.c
index 2d0988755..160e96e5d 100644
--- a/nvkm/subdev/vm/nvc0.c
+++ b/nvkm/subdev/vm/nvc0.c
@@ -94,6 +94,10 @@ nvc0_vm_map_pgt(struct nouveau_gpuobj *pgd, u32 index,
static inline u64
nvc0_vm_addr(struct nouveau_vma *vma, u64 phys, u32 memtype, u32 target)
{
+ if (phys & (1ULL << 34)) {
+ printk("%s %llx using IOMMU!\n", __func__, phys & ~(1ULL << 34));
+ }
+
phys >>= 8;
phys |= 0x00000001; /* present */
@@ -112,6 +116,7 @@ nvc0_vm_map(struct nouveau_vma *vma, struct nouveau_gpuobj *pgt,
{
u64 next = 1 << (vma->node->type - 8);
+ phys |= (1ULL << 34);
phys = nvc0_vm_addr(vma, phys, mem->memtype, 0);
pte <<= 3;
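
The vm/nvc0.c hunk keys off the tag that gk20a_ram_get_iommu() sets in mem->base.offset. A short worked example of that arithmetic, with purely illustrative values (the helper is not part of the patch):

#include <linux/bitops.h>
#include <linux/types.h>

/*
 * Worked example of the bit-34 tag: a region allocated at GPU-IOMMU page
 * index 0x100 by nouveau_mm_head() is reported as a byte offset with
 * bit 34 set, which is how nvc0_vm_addr() can tell the address has to be
 * resolved through the IOMMU.
 */
static u64 gk20a_bit34_example(void)
{
	u64 page_index = 0x100;
	u64 offset = (page_index << 12) | BIT_ULL(34);	/* 0x400100000 */
	bool via_iommu = offset & BIT_ULL(34);		/* true */

	/* clearing the tag recovers the IOMMU-space address: 0x100000 */
	return via_iommu ? offset & ~BIT_ULL(34) : offset;
}
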