author     Eric Anholt <eric@anholt.net>   2014-12-18 11:31:38 -0800
committer  Eric Anholt <eric@anholt.net>   2015-06-04 14:15:28 -0700
commit     c00050d0a663f2dc8d2f6399097dcef58c495434 (patch)
tree       7dfab4259822da20b3fd96a978a2fcc3c0754b69
parent     0022f7fa57d6009812d8558bf7afe5f22a9bcf4b (diff)
download   linux-c00050d0a663f2dc8d2f6399097dcef58c495434.tar.gz
drm/vc4: Add a BO cache for freed BOs.
The kernel needs to allocate BOs to execute rendering (both the exec
BO to store the user's commands, and the binner overflow BO to handle
the hardware's own needs for memory).  We need to cache these, because
getting a new CMA allocation is very expensive.

Note that userspace has its own BO cache, because it wants to avoid
the cost of mmapping BOs in addition to the cost of allocation.

This is about a 15% performance improvement on glxgears under X, which
is thoroughly CPU limited and not hitting refresh rate yet.  The next
steps are to cache shader validation (half of the ioctl overhead) and
to avoid unnecessary depth writes (a big chunk of memory bandwidth
overhead).

Signed-off-by: Eric Anholt <eric@anholt.net>
-rw-r--r--  drivers/gpu/drm/vc4/vc4_bo.c   | 158
-rw-r--r--  drivers/gpu/drm/vc4/vc4_drv.c  |   2
-rw-r--r--  drivers/gpu/drm/vc4/vc4_drv.h  |  39
-rw-r--r--  drivers/gpu/drm/vc4/vc4_gem.c  |   9
-rw-r--r--  drivers/gpu/drm/vc4/vc4_irq.c  |  10
5 files changed, 208 insertions(+), 10 deletions(-)
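
The O(1) cache lookup the commit message relies on comes from size
bucketing: CMA BO sizes are page-aligned, so dividing by the page size
gives a direct index into an array of per-size free lists.  A minimal
standalone userspace sketch of that bucketing (the PAGE_SIZE value and
the asserts are assumptions for the example, not the kernel code):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096u

    /* Map a page-aligned BO size to its cache bucket index. */
    static uint32_t bo_page_index(size_t size)
    {
            assert(size > 0 && size % PAGE_SIZE == 0);
            return (uint32_t)(size / PAGE_SIZE) - 1;
    }

    int main(void)
    {
            assert(bo_page_index(PAGE_SIZE) == 0);   /* one page -> bucket 0 */
            assert(bo_page_index(256 * 1024) == 63); /* 64 pages -> bucket 63 */
            return 0;
    }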
diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index 1459395d6e94..8bfca3b0b3e1 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -16,12 +16,75 @@
*/
#include "vc4_drv.h"
+#include "uapi/drm/vc4_drm.h"
+
+static uint32_t
+bo_page_index(size_t size)
+{
+ return (size / PAGE_SIZE) - 1;
+}
+
+static struct list_head *
+vc4_get_cache_list_for_size(struct drm_device *dev, size_t size)
+{
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
+ uint32_t page_index = bo_page_index(size);
+
+ if (vc4->bo_cache.size_list_size <= page_index) {
+ uint32_t new_size = max(vc4->bo_cache.size_list_size * 2,
+ page_index + 1);
+ struct list_head *new_list;
+ uint32_t i;
+
+ new_list = kmalloc(new_size * sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!new_list)
+ return NULL;
+
+ /* Rebase the old cached BO lists to their new list
+ * head locations.
+ */
+ for (i = 0; i < vc4->bo_cache.size_list_size; i++) {
+ struct list_head *old_list = &vc4->bo_cache.size_list[i];
+ if (list_empty(old_list))
+ INIT_LIST_HEAD(&new_list[i]);
+ else
+ list_replace(old_list, &new_list[i]);
+ }
+ /* And initialize the brand new BO list heads. */
+ for (i = vc4->bo_cache.size_list_size; i < new_size; i++)
+ INIT_LIST_HEAD(&new_list[i]);
+
+ kfree(vc4->bo_cache.size_list);
+ vc4->bo_cache.size_list = new_list;
+ vc4->bo_cache.size_list_size = new_size;
+ }
+
+ return &vc4->bo_cache.size_list[page_index];
+}
struct vc4_bo *
vc4_bo_create(struct drm_device *dev, size_t size)
{
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
+ uint32_t page_index = bo_page_index(size);
+ struct vc4_bo *bo = NULL;
struct drm_gem_cma_object *cma_obj;
+ /* First, try to get a vc4_bo from the kernel BO cache. */
+ if (vc4->bo_cache.size_list_size > page_index) {
+ if (!list_empty(&vc4->bo_cache.size_list[page_index])) {
+ bo = list_first_entry(&vc4->bo_cache.size_list[page_index],
+ struct vc4_bo, size_head);
+ list_del(&bo->size_head);
+ list_del(&bo->unref_head);
+ }
+ }
+ if (bo) {
+ kref_init(&bo->base.base.refcount);
+ return bo;
+ }
+
/* Otherwise, make a new BO. */
cma_obj = drm_gem_cma_create(dev, size);
if (IS_ERR(cma_obj))
@@ -56,3 +119,98 @@ vc4_dumb_create(struct drm_file *file_priv,
return ret;
}
+
+static void
+vc4_bo_cache_free_old(struct drm_device *dev)
+{
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
+ unsigned long expire_time = jiffies - msecs_to_jiffies(1000);
+
+ while (!list_empty(&vc4->bo_cache.time_list)) {
+ struct vc4_bo *bo = list_last_entry(&vc4->bo_cache.time_list,
+ struct vc4_bo, unref_head);
+ if (time_before(expire_time, bo->free_time)) {
+ mod_timer(&vc4->bo_cache.time_timer,
+ round_jiffies_up(jiffies +
+ msecs_to_jiffies(1000)));
+ return;
+ }
+
+ list_del(&bo->unref_head);
+ list_del(&bo->size_head);
+ drm_gem_cma_free_object(&bo->base.base);
+ }
+}
+
+/* Called on the last userspace/kernel unreference of the BO. Returns
+ * it to the BO cache if possible, otherwise frees it.
+ *
+ * Note that this is called with the struct_mutex held.
+ */
+void
+vc4_free_object(struct drm_gem_object *gem_bo)
+{
+ struct drm_device *dev = gem_bo->dev;
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
+ struct vc4_bo *bo = to_vc4_bo(gem_bo);
+ struct list_head *cache_list;
+
+ /* If the object references someone else's memory, we can't cache it.
+ */
+ if (gem_bo->import_attach) {
+ drm_gem_cma_free_object(gem_bo);
+ return;
+ }
+
+ /* Don't cache if it was publicly named. */
+ if (gem_bo->name) {
+ drm_gem_cma_free_object(gem_bo);
+ return;
+ }
+
+ cache_list = vc4_get_cache_list_for_size(dev, gem_bo->size);
+ if (!cache_list) {
+ drm_gem_cma_free_object(gem_bo);
+ return;
+ }
+
+ bo->free_time = jiffies;
+ list_add(&bo->size_head, cache_list);
+ list_add(&bo->unref_head, &vc4->bo_cache.time_list);
+
+ vc4_bo_cache_free_old(dev);
+}
+
+static void
+vc4_bo_cache_time_work(struct work_struct *work)
+{
+ struct vc4_dev *vc4 =
+ container_of(work, struct vc4_dev, bo_cache.time_work);
+ struct drm_device *dev = vc4->dev;
+
+ mutex_lock(&dev->struct_mutex);
+ vc4_bo_cache_free_old(dev);
+ mutex_unlock(&dev->struct_mutex);
+}
+
+static void
+vc4_bo_cache_time_timer(unsigned long data)
+{
+ struct drm_device *dev = (struct drm_device *)data;
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+ schedule_work(&vc4->bo_cache.time_work);
+}
+
+void
+vc4_bo_cache_init(struct drm_device *dev)
+{
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+ INIT_LIST_HEAD(&vc4->bo_cache.time_list);
+
+ INIT_WORK(&vc4->bo_cache.time_work, vc4_bo_cache_time_work);
+ setup_timer(&vc4->bo_cache.time_timer,
+ vc4_bo_cache_time_timer,
+ (unsigned long) dev);
+}
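
The rebase loop in vc4_get_cache_list_for_size() above is the subtle
part of this file: list heads are pointed back at by their neighbours,
so a grown array cannot simply be memcpy'd; each non-empty head must be
re-linked with list_replace().  A standalone model of that rebase,
assuming a minimal userspace reimplementation of the kernel's
list_head helpers (malloc error handling omitted for brevity):

    #include <stdio.h>
    #include <stdlib.h>

    struct list_head { struct list_head *next, *prev; };

    static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
    static int list_empty(const struct list_head *h) { return h->next == h; }

    static void list_add(struct list_head *n, struct list_head *h)
    {
            n->next = h->next;
            n->prev = h;
            h->next->prev = n;
            h->next = n;
    }

    /* Re-point old's neighbours at new, so new takes old's place. */
    static void list_replace(struct list_head *old, struct list_head *new)
    {
            new->next = old->next;
            new->next->prev = new;
            new->prev = old->prev;
            new->prev->next = new;
    }

    int main(void)
    {
            struct list_head *lists = malloc(2 * sizeof(*lists));
            struct list_head *bigger;
            struct list_head entry;
            int i;

            INIT_LIST_HEAD(&lists[0]);
            INIT_LIST_HEAD(&lists[1]);
            list_add(&entry, &lists[1]);

            /* Grow the array: rebase old heads, then init the new ones. */
            bigger = malloc(4 * sizeof(*bigger));
            for (i = 0; i < 2; i++) {
                    if (list_empty(&lists[i]))
                            INIT_LIST_HEAD(&bigger[i]);
                    else
                            list_replace(&lists[i], &bigger[i]);
            }
            for (i = 2; i < 4; i++)
                    INIT_LIST_HEAD(&bigger[i]);
            free(lists);

            printf("entry still linked: %s\n",
                   bigger[1].next == &entry && entry.prev == &bigger[1] ?
                   "yes" : "no");
            free(bigger);
            return 0;
    }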
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 7d59dc5b6d0d..0bdc0f9069af 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -138,7 +138,7 @@ static struct drm_driver vc4_drm_driver = {
.debugfs_cleanup = vc4_debugfs_cleanup,
#endif
- .gem_free_object = drm_gem_cma_free_object,
+ .gem_free_object = vc4_free_object,
.gem_vm_ops = &drm_gem_cma_vm_ops,
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 2d3a27332400..6d29b034b603 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -50,6 +50,27 @@ struct vc4_dev {
struct vc4_bo *overflow_mem;
struct work_struct overflow_mem_work;
+ /* The kernel-space BO cache. Tracks buffers that have been
+ * unreferenced by all other users (refcounts of 0!) but not
+ * yet freed, so we can do cheap allocations.
+ */
+ struct vc4_bo_cache {
+ /* Array of list heads for entries in the BO cache,
+ * based on number of pages, so we can do O(1) lookups
+ * in the cache when allocating.
+ */
+ struct list_head *size_list;
+ uint32_t size_list_size;
+
+ /* List of all BOs in the cache, ordered by age, so we
+ * can do O(1) lookups when trying to free old
+ * buffers.
+ */
+ struct list_head time_list;
+ struct work_struct time_work;
+ struct timer_list time_timer;
+ } bo_cache;
+
struct {
uint32_t last_ct0ca, last_ct1ca;
struct timer_list timer;
@@ -67,7 +88,17 @@ struct vc4_bo {
struct drm_gem_cma_object base;
/* seqno of the last job to render to this BO. */
uint64_t seqno;
+
+ /* List entry for the BO's position in either
+ * vc4_exec_info->unref_list or vc4_dev->bo_cache.time_list
+ */
struct list_head unref_head;
+
+ /* Time in jiffies when the BO was put in vc4->bo_cache. */
+ unsigned long free_time;
+
+ /* List entry for the BO's position in vc4_dev->bo_cache.size_list */
+ struct list_head size_head;
};
static inline struct vc4_bo *
@@ -312,6 +343,14 @@ void vc4_disable_vblank(struct drm_device *dev, int crtc_id);
#define wait_for(COND, MS) _wait_for(COND, MS, 1)
+/* vc4_bo.c */
+void vc4_bo_cache_init(struct drm_device *dev);
+void vc4_free_object(struct drm_gem_object *gem_obj);
+struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t size);
+int vc4_dumb_create(struct drm_file *file_priv,
+ struct drm_device *dev,
+ struct drm_mode_create_dumb *args);
+
/* vc4_debugfs.c */
int vc4_debugfs_init(struct drm_minor *minor);
void vc4_debugfs_cleanup(struct drm_minor *minor);
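
The two list_head members added to struct vc4_bo above mean a cached BO
sits on two lists at once: its size bucket (size_head) and the
age-ordered time_list (unref_head), so both lookups stay O(1) and
eviction unlinks from both.  A standalone sketch of that dual
membership, assuming userspace stand-ins for list_head and
container_of():

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Model of struct vc4_bo: one object linked on two lists. */
    struct bo {
            struct list_head size_head;  /* position in its size bucket */
            struct list_head unref_head; /* position in the age list */
            size_t size;
    };

    static void init_head(struct list_head *h) { h->next = h->prev = h; }

    static void add_head(struct list_head *n, struct list_head *h)
    {
            n->next = h->next;
            n->prev = h;
            h->next->prev = n;
            h->next = n;
    }

    int main(void)
    {
            struct list_head bucket, by_age;
            struct bo b = { .size = 4096 };
            struct bo *via_size, *via_age;

            init_head(&bucket);
            init_head(&by_age);
            add_head(&b.size_head, &bucket);
            add_head(&b.unref_head, &by_age);

            /* Walking either list recovers the same containing BO. */
            via_size = container_of(bucket.next, struct bo, size_head);
            via_age = container_of(by_age.next, struct bo, unref_head);
            printf("same BO: %s\n", via_size == via_age ? "yes" : "no");
            return 0;
    }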
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 37dd79a863b1..e0627f446667 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -341,6 +341,7 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
uint32_t exec_size = uniforms_offset + args->uniforms_size;
uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) *
args->shader_rec_count);
+ struct vc4_bo *bo;
if (shader_rec_offset < render_offset ||
uniforms_offset < shader_rec_offset ||
@@ -399,13 +400,13 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
goto fail;
}
- exec->exec_bo = drm_gem_cma_create(dev, exec_size);
- if (IS_ERR(exec->exec_bo)) {
+ bo = vc4_bo_create(dev, exec_size);
+ if (!bo) {
DRM_ERROR("Couldn't allocate BO for exec\n");
- ret = PTR_ERR(exec->exec_bo);
- exec->exec_bo = NULL;
+ ret = -ENOMEM;
goto fail;
}
+ exec->exec_bo = &bo->base;
list_add_tail(&to_vc4_bo(&exec->exec_bo->base)->unref_head,
&exec->unref_list);
@@ -597,4 +598,6 @@ vc4_gem_init(struct drm_device *dev)
(unsigned long) dev);
INIT_WORK(&vc4->job_done_work, vc4_job_done_work);
+
+ vc4_bo_cache_init(dev);
}
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
index c9a59b1c78e5..66201229801c 100644
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -35,15 +35,13 @@ vc4_overflow_mem_work(struct work_struct *work)
struct vc4_dev *vc4 =
container_of(work, struct vc4_dev, overflow_mem_work);
struct drm_device *dev = vc4->dev;
- struct drm_gem_cma_object *cma_obj;
struct vc4_bo *bo;
- cma_obj = drm_gem_cma_create(dev, 256 * 1024);
- if (IS_ERR(cma_obj)) {
+ bo = vc4_bo_create(dev, 256 * 1024);
+ if (!bo) {
DRM_ERROR("Couldn't allocate binner overflow mem\n");
return;
}
- bo = to_vc4_bo(&cma_obj->base);
/* If there's a job executing currently, then our previous
* overflow allocation is getting used in that job and we need
@@ -73,8 +71,8 @@ vc4_overflow_mem_work(struct work_struct *work)
}
vc4->overflow_mem = bo;
- V3D_WRITE(V3D_BPOA, cma_obj->paddr);
- V3D_WRITE(V3D_BPOS, cma_obj->base.size);
+ V3D_WRITE(V3D_BPOA, bo->base.paddr);
+ V3D_WRITE(V3D_BPOS, bo->base.base.size);
V3D_WRITE(V3D_INTDIS, 0);
V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
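
A final note on the expiry check in vc4_bo_cache_free_old() in
vc4_bo.c: jiffies wraps around, so comparing timestamps directly would
misfire near the wrap, which is why the code uses the kernel's
wrap-safe time_before().  A standalone model of that comparison (the
concrete values here are illustrative only):

    #include <stdio.h>

    /* Wrap-safe "a happened before b", as the kernel's time_before()
     * does for jiffies: the signed difference absorbs wraparound. */
    static int time_before(unsigned long a, unsigned long b)
    {
            return (long)(a - b) < 0;
    }

    int main(void)
    {
            unsigned long jiffies = 5;        /* just past a wraparound */
            unsigned long free_time = -3UL;   /* stamped before the wrap */
            unsigned long expire_time = jiffies - 1000;

            /* The BO was freed after the cutoff, so it stays cached. */
            printf("keep cached: %s\n",
                   time_before(expire_time, free_time) ? "yes" : "no");
            return 0;
    }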