author     Eric Anholt <eric@anholt.net>  2014-08-05 10:04:10 -0700
committer  Eric Anholt <eric@anholt.net>  2015-06-04 14:15:17 -0700
commit     51bd38241bdc3cee795668518dea68a567df72c9 (patch)
tree       9f2cc9e6f12ea18f471e8e744601784bd510e53c
parent     d91801885a55146f4145233f4a14ddb6815541cb (diff)
drm/vc4: Introduce shader validation and better command stream validation.
This is an import of the code I developed up in Mesa against the simulator. It rewrites the kernel ABI to support validation of uniforms, which is required for relocating references to texture contents.

Note that the docs in the shader validator are aspirational -- we don't yet handle the force-unmapping of shader BO contents, nor do we do any caching.

v2: Don't forget to git add vc4_qpu_defines.h

Signed-off-by: Eric Anholt <eric@anholt.net>
-rw-r--r--  drivers/gpu/drm/vc4/Makefile                  1
-rw-r--r--  drivers/gpu/drm/vc4/vc4_drv.h               108
-rw-r--r--  drivers/gpu/drm/vc4/vc4_gem.c                75
-rw-r--r--  drivers/gpu/drm/vc4/vc4_packet.h             32
-rw-r--r--  drivers/gpu/drm/vc4/vc4_qpu_defines.h       268
-rw-r--r--  drivers/gpu/drm/vc4/vc4_validate.c          855
-rw-r--r--  drivers/gpu/drm/vc4/vc4_validate_shaders.c  318
-rw-r--r--  include/uapi/drm/vc4_drm.h                   58
8 files changed, 1472 insertions, 243 deletions
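Before the diff itself, a brief sketch of the new submission layout may help: the ioctl now passes four separate user streams (bin CL, render CL, shader records, uniforms), and vc4_cl_validate() below packs their validated copies back-to-back into a single exec BO, with the shader records aligned up to 16 bytes. The helper name here is made up; the offset math mirrors the patch.

#include <stdint.h>

/* Sketch only -- mirrors the offset computation in vc4_cl_validate() below. */
static uint32_t vc4_exec_bo_size(uint32_t bin_cl_size, uint32_t render_cl_size,
				 uint32_t shader_rec_size, uint32_t uniforms_size)
{
	uint32_t bin_offset = 0;
	uint32_t render_offset = bin_offset + bin_cl_size;
	/* Shader records start on a 16-byte boundary. */
	uint32_t shader_rec_offset = (render_offset + render_cl_size + 15) & ~15u;
	uint32_t uniforms_offset = shader_rec_offset + shader_rec_size;

	return uniforms_offset + uniforms_size;
}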
diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile
index c35c07dfec8d..09dad06a75b9 100644
--- a/drivers/gpu/drm/vc4/Makefile
+++ b/drivers/gpu/drm/vc4/Makefile
@@ -14,6 +14,7 @@ vc4-y := \
vc4_plane.o \
vc4_v3d.o \
vc4_validate.o \
+ vc4_validate_shaders.o \
$()
vc4-$(CONFIG_DEBUG_FS) += vc4_debugfs.o
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index a2eff695de68..e741627211a1 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -84,25 +84,36 @@ to_vc4_plane(struct drm_plane *plane)
#define HVS_READ(offset) readl(vc4->hvs->regs + offset)
#define HVS_WRITE(offset, val) writel(val, vc4->hvs->regs + offset)
+enum vc4_bo_mode {
+ VC4_MODE_UNDECIDED,
+ VC4_MODE_TILE_ALLOC,
+ VC4_MODE_TSDA,
+ VC4_MODE_RENDER,
+ VC4_MODE_SHADER,
+};
+
+struct vc4_bo_exec_state {
+ struct drm_gem_cma_object *bo;
+ enum vc4_bo_mode mode;
+};
+
struct exec_info {
+ /* Kernel-space copy of the ioctl arguments */
+ struct drm_vc4_submit_cl *args;
+
/* This is the array of BOs that were looked up at the start of exec.
* Command validation will use indices into this array.
*/
- struct drm_gem_cma_object **bo;
+ struct vc4_bo_exec_state *bo;
uint32_t bo_count;
- /* Current indices into @bo loaded by the non-hardware packet
- * that passes in indices. This can be used even without
- * checking that we've seen one of those packets, because
- * @bo_count is always >= 1, and this struct is initialized to
- * 0.
+ /* Current unvalidated indices into @bo loaded by the non-hardware
+ * VC4_PACKET_GEM_HANDLES.
*/
uint32_t bo_index[2];
- uint32_t max_width, max_height;
- /**
- * This is the BO where we store the validated command lists
- * and shader records.
+ /* This is the BO where we store the validated command lists, shader
+ * records, and uniforms.
*/
struct drm_gem_cma_object *exec_bo;
@@ -115,6 +126,10 @@ struct exec_info {
struct vc4_shader_state {
uint8_t packet;
uint32_t addr;
+ /* Maximum vertex index referenced by any primitive using this
+ * shader state.
+ */
+ uint32_t max_index;
} *shader_state;
/** How many shader states the user declared they were using. */
@@ -122,13 +137,74 @@ struct exec_info {
/** How many shader state records the validator has seen. */
uint32_t shader_state_count;
+ bool found_tile_binning_mode_config_packet;
+ bool found_tile_rendering_mode_config_packet;
+ bool found_start_tile_binning_packet;
+ uint8_t bin_tiles_x, bin_tiles_y;
+ uint32_t fb_width, fb_height;
+ uint32_t tile_alloc_init_block_size;
+ struct drm_gem_cma_object *tile_alloc_bo;
+
/**
* Computed addresses pointing into exec_bo where we start the
* bin thread (ct0) and render thread (ct1).
*/
uint32_t ct0ca, ct0ea;
uint32_t ct1ca, ct1ea;
- uint32_t shader_paddr;
+
+ /* Pointers to the shader recs. The paddr gets incremented as CL
+ * packets are relocated in validate_gl_shader_state, and the vaddrs
+ * (u and v) get incremented and size decremented as the shader recs
+ * themselves are validated.
+ */
+ void *shader_rec_u;
+ void *shader_rec_v;
+ uint32_t shader_rec_p;
+ uint32_t shader_rec_size;
+
+ /* Pointers to the uniform data. These pointers are incremented, and
+ * size decremented, as each batch of uniforms is uploaded.
+ */
+ void *uniforms_u;
+ void *uniforms_v;
+ uint32_t uniforms_p;
+ uint32_t uniforms_size;
+};
+
+/**
+ * struct vc4_texture_sample_info - saves the offsets into the UBO for texture
+ * setup parameters.
+ *
+ * This will be used at draw time to relocate the reference to the texture
+ * contents in p0, and validate that the offset combined with
+ * width/height/stride/etc. from p1 and p2/p3 doesn't sample outside the BO.
+ * Note that the hardware treats unprovided config parameters as 0, so not all
+ * of them need to be set up for every texture sample, and we'll store ~0 as
+ * the offset to mark the unused ones.
+ *
+ * See the VC4 3D architecture guide page 41 ("Texture and Memory Lookup Unit
+ * Setup") for definitions of the texture parameters.
+ */
+struct vc4_texture_sample_info {
+ uint32_t p_offset[4];
+};
+
+/**
+ * struct vc4_validated_shader_info - information about validated shaders that
+ * needs to be used from command list validation.
+ *
+ * For a given shader, each time a shader state record references it, we need
+ * to verify that the shader doesn't read more uniforms than the shader state
+ * record's uniform BO pointer can provide, and we need to apply relocations
+ * and validate the shader state record's uniforms that define the texture
+ * samples.
+ */
+struct vc4_validated_shader_info
+{
+ uint32_t uniforms_size;
+ uint32_t uniforms_src_size;
+ uint32_t num_texture_samples;
+ struct vc4_texture_sample_info *texture_samples;
};
/* vc4_bo.c */
@@ -197,8 +273,8 @@ vc4_validate_cl(struct drm_device *dev,
struct exec_info *exec);
int
-vc4_validate_shader_recs(struct drm_device *dev,
- void *validated,
- void *unvalidated,
- uint32_t len,
- struct exec_info *exec);
+vc4_validate_shader_recs(struct drm_device *dev, struct exec_info *exec);
+
+struct vc4_validated_shader_info *
+vc4_validate_shader(struct drm_gem_cma_object *shader_obj,
+ uint32_t start_offset);
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index b1409a830f6e..854216f92482 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -179,9 +179,9 @@ vc4_submit(struct drm_device *dev, struct exec_info *args)
static int
vc4_cl_lookup_bos(struct drm_device *dev,
struct drm_file *file_priv,
- struct drm_vc4_submit_cl *args,
struct exec_info *exec)
{
+ struct drm_vc4_submit_cl *args = exec->args;
uint32_t *handles;
int ret = 0;
int i;
@@ -196,7 +196,7 @@ vc4_cl_lookup_bos(struct drm_device *dev,
return -EINVAL;
}
- exec->bo = kcalloc(exec->bo_count, sizeof(struct drm_gem_object *),
+ exec->bo = kcalloc(exec->bo_count, sizeof(struct vc4_bo_exec_state),
GFP_KERNEL);
if (!exec->bo) {
DRM_ERROR("Failed to allocate validated BO pointers\n");
@@ -226,7 +226,7 @@ vc4_cl_lookup_bos(struct drm_device *dev,
ret = -EINVAL;
goto fail;
}
- exec->bo[i] = (struct drm_gem_cma_object *)bo;
+ exec->bo[i].bo = (struct drm_gem_cma_object *)bo;
}
fail:
@@ -235,23 +235,25 @@ fail:
}
static int
-vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args,
- struct exec_info *exec)
+vc4_cl_validate(struct drm_device *dev, struct exec_info *exec)
{
+ struct drm_vc4_submit_cl *args = exec->args;
void *temp = NULL;
- void *bin, *render, *shader_rec;
+ void *bin, *render;
int ret = 0;
uint32_t bin_offset = 0;
- uint32_t render_offset = bin_offset + args->bin_cl_len;
+ uint32_t render_offset = bin_offset + args->bin_cl_size;
uint32_t shader_rec_offset = roundup(render_offset +
- args->render_cl_len, 16);
- uint32_t exec_size = shader_rec_offset + args->shader_record_len;
+ args->render_cl_size, 16);
+ uint32_t uniforms_offset = shader_rec_offset + args->shader_rec_size;
+ uint32_t exec_size = uniforms_offset + args->uniforms_size;
uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) *
- args->shader_record_count);
+ args->shader_rec_count);
if (shader_rec_offset < render_offset ||
- exec_size < shader_rec_offset ||
- args->shader_record_count >= (UINT_MAX /
+ uniforms_offset < shader_rec_offset ||
+ exec_size < uniforms_offset ||
+ args->shader_rec_count >= (UINT_MAX /
sizeof(struct vc4_shader_state)) ||
temp_size < exec_size) {
DRM_ERROR("overflow in exec arguments\n");
@@ -274,29 +276,37 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args,
}
bin = temp + bin_offset;
render = temp + render_offset;
- shader_rec = temp + shader_rec_offset;
+ exec->shader_rec_u = temp + shader_rec_offset;
+ exec->uniforms_u = temp + uniforms_offset;
exec->shader_state = temp + exec_size;
- exec->shader_state_size = args->shader_record_count;
+ exec->shader_state_size = args->shader_rec_count;
- ret = copy_from_user(bin, args->bin_cl, args->bin_cl_len);
+ ret = copy_from_user(bin, args->bin_cl, args->bin_cl_size);
if (ret) {
DRM_ERROR("Failed to copy in bin cl\n");
goto fail;
}
- ret = copy_from_user(render, args->render_cl, args->render_cl_len);
+ ret = copy_from_user(render, args->render_cl, args->render_cl_size);
if (ret) {
DRM_ERROR("Failed to copy in render cl\n");
goto fail;
}
- ret = copy_from_user(shader_rec, args->shader_records,
- args->shader_record_len);
+ ret = copy_from_user(exec->shader_rec_u, args->shader_rec,
+ args->shader_rec_size);
if (ret) {
DRM_ERROR("Failed to copy in shader recs\n");
goto fail;
}
+ ret = copy_from_user(exec->uniforms_u, args->uniforms,
+ args->uniforms_size);
+ if (ret) {
+ DRM_ERROR("Failed to copy in uniforms cl\n");
+ goto fail;
+ }
+
exec->exec_bo = drm_gem_cma_create(dev, exec_size);
if (IS_ERR(exec->exec_bo)) {
DRM_ERROR("Couldn't allocate BO for exec\n");
@@ -306,15 +316,20 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args,
}
exec->ct0ca = exec->exec_bo->paddr + bin_offset;
- exec->ct0ea = exec->ct0ca + args->bin_cl_len;
exec->ct1ca = exec->exec_bo->paddr + render_offset;
- exec->ct1ea = exec->ct1ca + args->render_cl_len;
- exec->shader_paddr = exec->exec_bo->paddr + shader_rec_offset;
+
+ exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset;
+ exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset;
+ exec->shader_rec_size = args->shader_rec_size;
+
+ exec->uniforms_v = exec->exec_bo->vaddr + uniforms_offset;
+ exec->uniforms_p = exec->exec_bo->paddr + uniforms_offset;
+ exec->uniforms_size = args->uniforms_size;
ret = vc4_validate_cl(dev,
exec->exec_bo->vaddr + bin_offset,
bin,
- args->bin_cl_len,
+ args->bin_cl_size,
true,
exec);
if (ret)
@@ -323,17 +338,13 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args,
ret = vc4_validate_cl(dev,
exec->exec_bo->vaddr + render_offset,
render,
- args->render_cl_len,
+ args->render_cl_size,
false,
exec);
if (ret)
goto fail;
- ret = vc4_validate_shader_recs(dev,
- exec->exec_bo->vaddr + shader_rec_offset,
- shader_rec,
- args->shader_record_len,
- exec);
+ ret = vc4_validate_shader_recs(dev, exec);
fail:
kfree(temp);
@@ -349,20 +360,20 @@ int
vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv)
{
- struct drm_vc4_submit_cl *args = data;
struct exec_info exec;
int ret;
int i;
memset(&exec, 0, sizeof(exec));
+ exec.args = data;
mutex_lock(&dev->struct_mutex);
- ret = vc4_cl_lookup_bos(dev, file_priv, args, &exec);
+ ret = vc4_cl_lookup_bos(dev, file_priv, &exec);
if (ret)
goto fail;
- ret = vc4_cl_validate(dev, args, &exec);
+ ret = vc4_cl_validate(dev, &exec);
if (ret)
goto fail;
@@ -375,7 +386,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
fail:
if (exec.bo) {
for (i = 0; i < exec.bo_count; i++)
- drm_gem_object_unreference(&exec.bo[i]->base);
+ drm_gem_object_unreference(&exec.bo[i].bo->base);
kfree(exec.bo);
}
diff --git a/drivers/gpu/drm/vc4/vc4_packet.h b/drivers/gpu/drm/vc4/vc4_packet.h
index cc3786677782..e455c2fe76a5 100644
--- a/drivers/gpu/drm/vc4/vc4_packet.h
+++ b/drivers/gpu/drm/vc4/vc4_packet.h
@@ -72,10 +72,25 @@ enum vc4_packet {
VC4_PACKET_TILE_RENDERING_MODE_CONFIG = 113,
VC4_PACKET_CLEAR_COLORS = 114,
VC4_PACKET_TILE_COORDINATES = 115,
- GEM_HANDLES = 254,
+
+ /* Not an actual hardware packet -- this is what we use to put
+ * references to GEM bos in the command stream, since we need the u32
+ * int the actual address packet in order to store the offset from the
+ * start of the BO.
+ */
+ VC4_PACKET_GEM_HANDLES = 254,
} __attribute__ ((__packed__));
/** @{
+ * Bits used by packets like VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_TILE_RENDERING_MODE_CONFIG.
+*/
+#define VC4_TILING_FORMAT_LINEAR 0
+#define VC4_TILING_FORMAT_T 1
+#define VC4_TILING_FORMAT_LT 2
+/** @} */
+
+/** @{
*
* byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
* VC4_PACKET_LOAD_TILE_BUFFER_GENERAL (low bits of the address)
@@ -100,6 +115,7 @@ enum vc4_packet {
#define VC4_LOADSTORE_TILE_BUFFER_RGBA8888 (0 << 0)
#define VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER (1 << 0)
#define VC4_LOADSTORE_TILE_BUFFER_BGR565 (2 << 0)
+#define VC4_LOADSTORE_TILE_BUFFER_MASK (3 << 0)
/** @} */
/** @{
@@ -111,9 +127,10 @@ enum vc4_packet {
#define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X4 (1 << 6)
#define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X16 (2 << 6)
-#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_RASTER (0 << 4)
-#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_T (1 << 4)
-#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_LT (2 << 4)
+/** The values of the field are VC4_TILING_FORMAT_* */
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK (3 << 4)
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT 4
+
#define VC4_LOADSTORE_TILE_BUFFER_NONE (0 << 0)
#define VC4_LOADSTORE_TILE_BUFFER_COLOR (1 << 0)
@@ -188,9 +205,9 @@ enum vc4_packet {
#define VC4_RENDER_CONFIG_COVERAGE_MODE (1 << 9)
#define VC4_RENDER_CONFIG_ENABLE_VG_MASK (1 << 8)
-#define VC4_RENDER_CONFIG_MEMORY_FORMAT_LINEAR (0 << 6)
-#define VC4_RENDER_CONFIG_MEMORY_FORMAT_T (1 << 6)
-#define VC4_RENDER_CONFIG_MEMORY_FORMAT_LT (2 << 6)
+/** The values of the field are VC4_TILING_FORMAT_* */
+#define VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK (3 << 6)
+#define VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT 6
#define VC4_RENDER_CONFIG_DECIMATE_MODE_1X (0 << 4)
#define VC4_RENDER_CONFIG_DECIMATE_MODE_4X (1 << 4)
@@ -199,6 +216,7 @@ enum vc4_packet {
#define VC4_RENDER_CONFIG_FORMAT_BGR565 (0 << 2)
#define VC4_RENDER_CONFIG_FORMAT_RGBA8888 (1 << 2)
#define VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED (2 << 2)
+#define VC4_RENDER_CONFIG_FORMAT_MASK (3 << 2)
#define VC4_RENDER_CONFIG_TILE_BUFFER_64BIT (1 << 1)
#define VC4_RENDER_CONFIG_MS_MODE_4X (1 << 0)
diff --git a/drivers/gpu/drm/vc4/vc4_qpu_defines.h b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
new file mode 100644
index 000000000000..e47c994d36bf
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC4_QPU_DEFINES_H
+#define VC4_QPU_DEFINES_H
+
+enum qpu_op_add {
+ QPU_A_NOP,
+ QPU_A_FADD,
+ QPU_A_FSUB,
+ QPU_A_FMIN,
+ QPU_A_FMAX,
+ QPU_A_FMINABS,
+ QPU_A_FMAXABS,
+ QPU_A_FTOI,
+ QPU_A_ITOF,
+ QPU_A_ADD = 12,
+ QPU_A_SUB,
+ QPU_A_SHR,
+ QPU_A_ASR,
+ QPU_A_ROR,
+ QPU_A_SHL,
+ QPU_A_MIN,
+ QPU_A_MAX,
+ QPU_A_AND,
+ QPU_A_OR,
+ QPU_A_XOR,
+ QPU_A_NOT,
+ QPU_A_CLZ,
+ QPU_A_V8ADDS = 30,
+ QPU_A_V8SUBS = 31,
+};
+
+enum qpu_op_mul {
+ QPU_M_NOP,
+ QPU_M_FMUL,
+ QPU_M_MUL24,
+ QPU_M_V8MULD,
+ QPU_M_V8MIN,
+ QPU_M_V8MAX,
+ QPU_M_V8ADDS,
+ QPU_M_V8SUBS,
+};
+
+enum qpu_raddr {
+ QPU_R_FRAG_PAYLOAD_ZW = 15, /* W for A file, Z for B file */
+ /* 0-31 are the plain regfile a or b fields */
+ QPU_R_UNIF = 32,
+ QPU_R_VARY = 35,
+ QPU_R_ELEM_QPU = 38,
+ QPU_R_NOP,
+ QPU_R_XY_PIXEL_COORD = 41,
+ QPU_R_MS_REV_FLAGS = 41,
+ QPU_R_VPM = 48,
+ QPU_R_VPM_LD_BUSY,
+ QPU_R_VPM_LD_WAIT,
+ QPU_R_MUTEX_ACQUIRE,
+};
+
+enum qpu_waddr {
+ /* 0-31 are the plain regfile a or b fields */
+ QPU_W_ACC0 = 32, /* aka r0 */
+ QPU_W_ACC1,
+ QPU_W_ACC2,
+ QPU_W_ACC3,
+ QPU_W_TMU_NOSWAP,
+ QPU_W_ACC5,
+ QPU_W_HOST_INT,
+ QPU_W_NOP,
+ QPU_W_UNIFORMS_ADDRESS,
+ QPU_W_QUAD_XY, /* X for regfile a, Y for regfile b */
+ QPU_W_MS_FLAGS = 42,
+ QPU_W_REV_FLAG = 42,
+ QPU_W_TLB_STENCIL_SETUP = 43,
+ QPU_W_TLB_Z,
+ QPU_W_TLB_COLOR_MS,
+ QPU_W_TLB_COLOR_ALL,
+ QPU_W_TLB_ALPHA_MASK,
+ QPU_W_VPM,
+ QPU_W_VPMVCD_SETUP, /* LD for regfile a, ST for regfile b */
+ QPU_W_VPM_ADDR, /* LD for regfile a, ST for regfile b */
+ QPU_W_MUTEX_RELEASE,
+ QPU_W_SFU_RECIP,
+ QPU_W_SFU_RECIPSQRT,
+ QPU_W_SFU_EXP,
+ QPU_W_SFU_LOG,
+ QPU_W_TMU0_S,
+ QPU_W_TMU0_T,
+ QPU_W_TMU0_R,
+ QPU_W_TMU0_B,
+ QPU_W_TMU1_S,
+ QPU_W_TMU1_T,
+ QPU_W_TMU1_R,
+ QPU_W_TMU1_B,
+};
+
+enum qpu_sig_bits {
+ QPU_SIG_SW_BREAKPOINT,
+ QPU_SIG_NONE,
+ QPU_SIG_THREAD_SWITCH,
+ QPU_SIG_PROG_END,
+ QPU_SIG_WAIT_FOR_SCOREBOARD,
+ QPU_SIG_SCOREBOARD_UNLOCK,
+ QPU_SIG_LAST_THREAD_SWITCH,
+ QPU_SIG_COVERAGE_LOAD,
+ QPU_SIG_COLOR_LOAD,
+ QPU_SIG_COLOR_LOAD_END,
+ QPU_SIG_LOAD_TMU0,
+ QPU_SIG_LOAD_TMU1,
+ QPU_SIG_ALPHA_MASK_LOAD,
+ QPU_SIG_SMALL_IMM,
+ QPU_SIG_LOAD_IMM,
+ QPU_SIG_BRANCH
+};
+
+enum qpu_mux {
+ /* hardware mux values */
+ QPU_MUX_R0,
+ QPU_MUX_R1,
+ QPU_MUX_R2,
+ QPU_MUX_R3,
+ QPU_MUX_R4,
+ QPU_MUX_R5,
+ QPU_MUX_A,
+ QPU_MUX_B,
+
+ /* non-hardware mux values */
+ QPU_MUX_IMM,
+};
+
+enum qpu_cond {
+ QPU_COND_NEVER,
+ QPU_COND_ALWAYS,
+ QPU_COND_ZS,
+ QPU_COND_ZC,
+ QPU_COND_NS,
+ QPU_COND_NC,
+ QPU_COND_CS,
+ QPU_COND_CC,
+};
+
+enum qpu_pack_mul {
+ QPU_PACK_MUL_NOP,
+ QPU_PACK_MUL_8888 = 3, /* replicated to each 8 bits of the 32-bit dst. */
+ QPU_PACK_MUL_8A,
+ QPU_PACK_MUL_8B,
+ QPU_PACK_MUL_8C,
+ QPU_PACK_MUL_8D,
+};
+
+enum qpu_pack_a {
+ QPU_PACK_A_NOP,
+ /* convert to 16 bit float if float input, or to int16. */
+ QPU_PACK_A_16A,
+ QPU_PACK_A_16B,
+ /* replicated to each 8 bits of the 32-bit dst. */
+ QPU_PACK_A_8888,
+ /* Convert to 8-bit unsigned int. */
+ QPU_PACK_A_8A,
+ QPU_PACK_A_8B,
+ QPU_PACK_A_8C,
+ QPU_PACK_A_8D,
+
+ /* Saturating variants of the previous instructions. */
+ QPU_PACK_A_32_SAT, /* int-only */
+ QPU_PACK_A_16A_SAT, /* int or float */
+ QPU_PACK_A_16B_SAT,
+ QPU_PACK_A_8888_SAT,
+ QPU_PACK_A_8A_SAT,
+ QPU_PACK_A_8B_SAT,
+ QPU_PACK_A_8C_SAT,
+ QPU_PACK_A_8D_SAT,
+};
+
+enum qpu_unpack_r4 {
+ QPU_UNPACK_R4_NOP,
+ QPU_UNPACK_R4_F16A_TO_F32,
+ QPU_UNPACK_R4_F16B_TO_F32,
+ QPU_UNPACK_R4_8D_REP,
+ QPU_UNPACK_R4_8A,
+ QPU_UNPACK_R4_8B,
+ QPU_UNPACK_R4_8C,
+ QPU_UNPACK_R4_8D,
+};
+
+#define QPU_MASK(high, low) ((((uint64_t)1<<((high)-(low)+1))-1)<<(low))
+/* Using the GNU statement expression extension */
+#define QPU_SET_FIELD(value, field) \
+ ({ \
+ uint64_t fieldval = (uint64_t)(value) << field ## _SHIFT; \
+ assert((fieldval & ~ field ## _MASK) == 0); \
+ fieldval & field ## _MASK; \
+ })
+
+#define QPU_GET_FIELD(word, field) ((uint32_t)(((word) & field ## _MASK) >> field ## _SHIFT))
+
+#define QPU_SIG_SHIFT 60
+#define QPU_SIG_MASK QPU_MASK(63, 60)
+
+#define QPU_UNPACK_SHIFT 57
+#define QPU_UNPACK_MASK QPU_MASK(59, 57)
+
+/**
+ * If set, the pack field means PACK_MUL or R4 packing, instead of normal
+ * regfile a packing.
+ */
+#define QPU_PM ((uint64_t)1 << 56)
+
+#define QPU_PACK_SHIFT 52
+#define QPU_PACK_MASK QPU_MASK(55, 52)
+
+#define QPU_COND_ADD_SHIFT 49
+#define QPU_COND_ADD_MASK QPU_MASK(51, 49)
+#define QPU_COND_MUL_SHIFT 46
+#define QPU_COND_MUL_MASK QPU_MASK(48, 46)
+
+#define QPU_SF ((uint64_t)1 << 45)
+
+#define QPU_WADDR_ADD_SHIFT 38
+#define QPU_WADDR_ADD_MASK QPU_MASK(43, 38)
+#define QPU_WADDR_MUL_SHIFT 32
+#define QPU_WADDR_MUL_MASK QPU_MASK(37, 32)
+
+#define QPU_OP_MUL_SHIFT 29
+#define QPU_OP_MUL_MASK QPU_MASK(31, 29)
+
+#define QPU_RADDR_A_SHIFT 18
+#define QPU_RADDR_A_MASK QPU_MASK(23, 18)
+#define QPU_RADDR_B_SHIFT 12
+#define QPU_RADDR_B_MASK QPU_MASK(17, 12)
+#define QPU_SMALL_IMM_SHIFT 12
+#define QPU_SMALL_IMM_MASK QPU_MASK(17, 12)
+
+#define QPU_ADD_A_SHIFT 9
+#define QPU_ADD_A_MASK QPU_MASK(11, 9)
+#define QPU_ADD_B_SHIFT 6
+#define QPU_ADD_B_MASK QPU_MASK(8, 6)
+#define QPU_MUL_A_SHIFT 3
+#define QPU_MUL_A_MASK QPU_MASK(5, 3)
+#define QPU_MUL_B_SHIFT 0
+#define QPU_MUL_B_MASK QPU_MASK(2, 0)
+
+#define QPU_WS ((uint64_t)1 << 44)
+
+#define QPU_OP_ADD_SHIFT 24
+#define QPU_OP_ADD_MASK QPU_MASK(28, 24)
+
+#endif /* VC4_QPU_DEFINES_H */
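As a usage sketch (not part of the patch), the field macros above are what the shader validator added below in vc4_validate_shaders.c uses to decode a raw 64-bit QPU instruction; writes_tmu() here is a hypothetical helper mirroring its is_tmu_write()/check_instruction_writes() test.

#include <stdint.h>
#include <stdbool.h>
#include "vc4_qpu_defines.h"	/* assumed include path */

/* Hypothetical helper: extract both write-address fields from a raw QPU
 * instruction and check whether either one targets a TMU register. */
static bool writes_tmu(uint64_t inst)
{
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

	return (waddr_add >= QPU_W_TMU0_S && waddr_add <= QPU_W_TMU1_B) ||
	       (waddr_mul >= QPU_W_TMU0_S && waddr_mul <= QPU_W_TMU1_B);
}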
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
index a68d331f0497..5cf4643f8c0b 100644
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -41,40 +41,221 @@
#include "vc4_drv.h"
#include "vc4_packet.h"
-#include "vc4_regs.h"
#define VALIDATE_ARGS \
struct exec_info *exec, \
void *validated, \
void *untrusted
+static bool
+vc4_use_bo(struct exec_info *exec,
+ uint32_t hindex,
+ enum vc4_bo_mode mode,
+ struct drm_gem_cma_object **obj)
+{
+ *obj = NULL;
+
+ if (hindex >= exec->bo_count) {
+ DRM_ERROR("BO index %d greater than BO count %d\n",
+ hindex, exec->bo_count);
+ return false;
+ }
+
+ if (exec->bo[hindex].mode != mode) {
+ if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) {
+ exec->bo[hindex].mode = mode;
+ } else {
+ DRM_ERROR("BO index %d reused with mode %d vs %d\n",
+ hindex, exec->bo[hindex].mode, mode);
+ return false;
+ }
+ }
+
+ *obj = exec->bo[hindex].bo;
+ return true;
+}
+
+static bool
+vc4_use_handle(struct exec_info *exec,
+ uint32_t gem_handles_packet_index,
+ enum vc4_bo_mode mode,
+ struct drm_gem_cma_object **obj)
+{
+ return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index],
+ mode, obj);
+}
+
+static uint32_t
+gl_shader_rec_size(uint32_t pointer_bits)
+{
+ uint32_t attribute_count = pointer_bits & 7;
+ bool extended = pointer_bits & 8;
+
+ if (attribute_count == 0)
+ attribute_count = 8;
+
+ if (extended)
+ return 100 + attribute_count * 4;
+ else
+ return 36 + attribute_count * 8;
+}
+
+static bool
+check_tex_size(struct exec_info *exec, struct drm_gem_cma_object *fbo,
+ uint32_t offset, uint8_t tiling_format,
+ uint32_t width, uint32_t height, uint8_t cpp)
+{
+ uint32_t width_align, height_align;
+ uint32_t aligned_row_len, aligned_h, size;
+
+ switch (tiling_format) {
+ case VC4_TILING_FORMAT_LINEAR:
+ width_align = 16;
+ height_align = 1;
+ break;
+ case VC4_TILING_FORMAT_T:
+ width_align = 128;
+ height_align = 32;
+ break;
+ case VC4_TILING_FORMAT_LT:
+ width_align = 16;
+ height_align = 4;
+ break;
+ default:
+ DRM_ERROR("buffer tiling %d unsupported\n", tiling_format);
+ return false;
+ }
+
+ /* The values are limited by the packet/texture parameter bitfields,
+ * so we don't need to worry as much about integer overflow.
+ */
+ BUG_ON(width > 65535);
+ BUG_ON(height > 65535);
+
+ aligned_row_len = roundup(width * cpp, width_align);
+ aligned_h = roundup(height, height_align);
+
+ if (INT_MAX / aligned_row_len < aligned_h) {
+ DRM_ERROR("Overflow in fbo size (%d * %d)\n",
+ aligned_row_len, aligned_h);
+ return false;
+ }
+ size = aligned_row_len * aligned_h;
+
+ if (size + offset < size ||
+ size + offset > fbo->base.size) {
+ DRM_ERROR("Overflow in %dx%d fbo size (%d + %d > %d)\n",
+ width, height, size, offset, fbo->base.size);
+ return false;
+ }
+
+ return true;
+}
+
+static int
+validate_start_tile_binning(VALIDATE_ARGS)
+{
+ if (exec->found_start_tile_binning_packet) {
+ DRM_ERROR("Duplicate VC4_PACKET_START_TILE_BINNING\n");
+ return -EINVAL;
+ }
+ exec->found_start_tile_binning_packet = true;
+
+ if (!exec->found_tile_binning_mode_config_packet) {
+ DRM_ERROR("missing VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int
validate_branch_to_sublist(VALIDATE_ARGS)
{
struct drm_gem_cma_object *target;
+ uint32_t offset;
+
+ if (!vc4_use_handle(exec, 0, VC4_MODE_TILE_ALLOC, &target))
+ return -EINVAL;
- /* XXX: Validate address jumped to */
+ if (target != exec->tile_alloc_bo) {
+ DRM_ERROR("Jumping to BOs other than tile alloc unsupported\n");
+ return -EINVAL;
+ }
- target = exec->bo[exec->bo_index[0]];
+ offset = *(uint32_t *)(untrusted + 0);
+ if (offset % exec->tile_alloc_init_block_size ||
+ offset / exec->tile_alloc_init_block_size >
+ exec->bin_tiles_x * exec->bin_tiles_y) {
+ DRM_ERROR("VC4_PACKET_BRANCH_TO_SUB_LIST must jump to initial "
+ "tile allocation space.\n");
+ return -EINVAL;
+ }
- *(uint32_t *)(validated + 0) =
- *(uint32_t *)(untrusted + 0) + target->paddr;
+ *(uint32_t *)(validated + 0) = target->paddr + offset;
return 0;
}
+/**
+ * validate_loadstore_tile_buffer_general() - Validation for
+ * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_STORE_TILE_BUFFER_GENERAL.
+ *
+ * The two packets are nearly the same, except for the TLB-clearing management
+ * bits not being present for loads. Additionally, while stores are executed
+ * immediately (using the current tile coordinates), loads are queued to be
+ * executed when the tile coordinates packet occurs.
+ *
+ * Note that coordinates packets are validated to be within the declared
+ * bin_x/y, which themselves are verified to match the rendering-configuration
+ * FB width and height (which the hardware uses to clip loads and stores).
+ */
static int
validate_loadstore_tile_buffer_general(VALIDATE_ARGS)
{
uint32_t packet_b0 = *(uint8_t *)(untrusted + 0);
- struct drm_gem_cma_object *fbo = exec->bo[exec->bo_index[0]];
+ uint32_t packet_b1 = *(uint8_t *)(untrusted + 1);
+ struct drm_gem_cma_object *fbo;
+ uint32_t buffer_type = packet_b0 & 0xf;
+ uint32_t offset, cpp;
- if ((packet_b0 & 0xf) == VC4_LOADSTORE_TILE_BUFFER_NONE)
+ switch (buffer_type) {
+ case VC4_LOADSTORE_TILE_BUFFER_NONE:
return 0;
+ case VC4_LOADSTORE_TILE_BUFFER_COLOR:
+ if ((packet_b1 & VC4_LOADSTORE_TILE_BUFFER_MASK) ==
+ VC4_LOADSTORE_TILE_BUFFER_RGBA8888) {
+ cpp = 4;
+ } else {
+ cpp = 2;
+ }
+ break;
- /* XXX: Validate address offset */
- *(uint32_t *)(validated + 2) =
- *(uint32_t *)(untrusted + 2) + fbo->paddr;
+ case VC4_LOADSTORE_TILE_BUFFER_Z:
+ case VC4_LOADSTORE_TILE_BUFFER_ZS:
+ cpp = 4;
+ break;
+
+ default:
+ DRM_ERROR("Load/store type %d unsupported\n", buffer_type);
+ return -EINVAL;
+ }
+
+ if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &fbo))
+ return -EINVAL;
+
+ offset = *(uint32_t *)(untrusted + 2);
+
+ if (!check_tex_size(exec, fbo, offset,
+ ((packet_b0 &
+ VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK) >>
+ VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT),
+ exec->fb_width, exec->fb_height, cpp)) {
+ return -EINVAL;
+ }
+
+ *(uint32_t *)(validated + 2) = offset + fbo->paddr;
return 0;
}
@@ -83,30 +264,60 @@ static int
validate_indexed_prim_list(VALIDATE_ARGS)
{
struct drm_gem_cma_object *ib;
+ uint32_t length = *(uint32_t *)(untrusted + 1);
+ uint32_t offset = *(uint32_t *)(untrusted + 5);
uint32_t max_index = *(uint32_t *)(untrusted + 9);
uint32_t index_size = (*(uint8_t *)(untrusted + 0) >> 4) ? 2 : 1;
- uint32_t ib_access_end = (max_index + 1) * index_size;
+ struct vc4_shader_state *shader_state;
/* Check overflow condition */
- if (max_index == ~0) {
- DRM_ERROR("unlimited max index\n");
+ if (exec->shader_state_count == 0) {
+ DRM_ERROR("shader state must precede primitives\n");
return -EINVAL;
}
+ shader_state = &exec->shader_state[exec->shader_state_count - 1];
+
+ if (max_index > shader_state->max_index)
+ shader_state->max_index = max_index;
- if (ib_access_end < max_index) {
- DRM_ERROR("IB access overflow\n");
+ if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &ib))
+ return -EINVAL;
+
+ if (offset > ib->base.size ||
+ (ib->base.size - offset) / index_size < length) {
+ DRM_ERROR("IB access overflow (%d + %d*%d > %d)\n",
+ offset, length, index_size, ib->base.size);
return -EINVAL;
}
- ib = exec->bo[exec->bo_index[0]];
- if (ib_access_end > ib->base.size) {
- DRM_ERROR("IB access out of bounds (%d/%d)\n",
- ib_access_end, ib->base.size);
+ *(uint32_t *)(validated + 5) = ib->paddr + offset;
+
+ return 0;
+}
+
+static int
+validate_gl_array_primitive(VALIDATE_ARGS)
+{
+ uint32_t length = *(uint32_t *)(untrusted + 1);
+ uint32_t base_index = *(uint32_t *)(untrusted + 5);
+ uint32_t max_index;
+ struct vc4_shader_state *shader_state;
+
+ /* Check overflow condition */
+ if (exec->shader_state_count == 0) {
+ DRM_ERROR("shader state must precede primitives\n");
return -EINVAL;
}
+ shader_state = &exec->shader_state[exec->shader_state_count - 1];
- *(uint32_t *)(validated + 5) =
- *(uint32_t *)(untrusted + 5) + ib->paddr;
+ if (length + base_index < length) {
+ DRM_ERROR("primitive vertex count overflow\n");
+ return -EINVAL;
+ }
+ max_index = length + base_index - 1;
+
+ if (max_index > shader_state->max_index)
+ shader_state->max_index = max_index;
return 0;
}
@@ -114,11 +325,8 @@ validate_indexed_prim_list(VALIDATE_ARGS)
static int
validate_gl_shader_state(VALIDATE_ARGS)
{
- struct drm_gem_cma_object *shader;
uint32_t i = exec->shader_state_count++;
- shader = exec->bo[exec->bo_index[0]];
-
if (i >= exec->shader_state_size) {
DRM_ERROR("More requests for shader states than declared\n");
return -EINVAL;
@@ -126,9 +334,18 @@ validate_gl_shader_state(VALIDATE_ARGS)
exec->shader_state[i].packet = VC4_PACKET_GL_SHADER_STATE;
exec->shader_state[i].addr = *(uint32_t *)untrusted;
+ exec->shader_state[i].max_index = 0;
+
+ if (exec->shader_state[i].addr & ~0xf) {
+ DRM_ERROR("high bits set in GL shader rec reference\n");
+ return -EINVAL;
+ }
+
+ *(uint32_t *)validated = (exec->shader_rec_p +
+ exec->shader_state[i].addr);
- *(uint32_t *)validated = exec->shader_state[i].addr +
- exec->shader_paddr;
+ exec->shader_rec_p +=
+ roundup(gl_shader_rec_size(exec->shader_state[i].addr), 16);
return 0;
}
@@ -152,8 +369,8 @@ validate_nv_shader_state(VALIDATE_ARGS)
return -EINVAL;
}
- *(uint32_t *)validated =
- exec->shader_state[i].addr + exec->shader_paddr;
+ *(uint32_t *)validated = (exec->shader_state[i].addr +
+ exec->shader_rec_p);
return 0;
}
@@ -163,16 +380,79 @@ validate_tile_binning_config(VALIDATE_ARGS)
{
struct drm_gem_cma_object *tile_allocation;
struct drm_gem_cma_object *tile_state_data_array;
+ uint8_t flags;
+ uint32_t tile_allocation_size;
+
+ if (!vc4_use_handle(exec, 0, VC4_MODE_TILE_ALLOC, &tile_allocation) ||
+ !vc4_use_handle(exec, 1, VC4_MODE_TSDA, &tile_state_data_array))
+ return -EINVAL;
+
+ if (exec->found_tile_binning_mode_config_packet) {
+ DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
+ return -EINVAL;
+ }
+ exec->found_tile_binning_mode_config_packet = true;
+
+ exec->bin_tiles_x = *(uint8_t *)(untrusted + 12);
+ exec->bin_tiles_y = *(uint8_t *)(untrusted + 13);
+ flags = *(uint8_t *)(untrusted + 14);
- tile_allocation = exec->bo[exec->bo_index[0]];
- tile_state_data_array = exec->bo[exec->bo_index[1]];
+ if (exec->bin_tiles_x == 0 ||
+ exec->bin_tiles_y == 0) {
+ DRM_ERROR("Tile binning config of %dx%d too small\n",
+ exec->bin_tiles_x, exec->bin_tiles_y);
+ return -EINVAL;
+ }
- /* XXX: Validate offsets */
- *(uint32_t *)validated =
- *(uint32_t *)untrusted + tile_allocation->paddr;
+ /* Our validation relies on the user not getting to set up their own
+ * tile state/tile allocation BO contents.
+ */
+ if (!(flags & VC4_BIN_CONFIG_AUTO_INIT_TSDA)) {
+ DRM_ERROR("binning config missing "
+ "VC4_BIN_CONFIG_AUTO_INIT_TSDA\n");
+ return -EINVAL;
+ }
- *(uint32_t *)(validated + 8) =
- *(uint32_t *)(untrusted + 8) + tile_state_data_array->paddr;
+ if (flags & (VC4_BIN_CONFIG_DB_NON_MS |
+ VC4_BIN_CONFIG_TILE_BUFFER_64BIT |
+ VC4_BIN_CONFIG_MS_MODE_4X)) {
+ DRM_ERROR("unsupported bining config flags 0x%02x\n", flags);
+ return -EINVAL;
+ }
+
+ if (*(uint32_t *)(untrusted + 0) != 0) {
+ DRM_ERROR("tile allocation offset != 0 unsupported\n");
+ return -EINVAL;
+ }
+ tile_allocation_size = *(uint32_t *)(untrusted + 4);
+ if (tile_allocation_size > tile_allocation->base.size) {
+ DRM_ERROR("tile allocation size %d > BO size %d\n",
+ tile_allocation_size, tile_allocation->base.size);
+ return -EINVAL;
+ }
+ *(uint32_t *)validated = tile_allocation->paddr;
+ exec->tile_alloc_bo = tile_allocation;
+
+ exec->tile_alloc_init_block_size = 1 << (5 + ((flags >> 5) & 3));
+ if (exec->bin_tiles_x * exec->bin_tiles_y *
+ exec->tile_alloc_init_block_size > tile_allocation_size) {
+ DRM_ERROR("tile init exceeds tile alloc size (%d vs %d)\n",
+ exec->bin_tiles_x * exec->bin_tiles_y *
+ exec->tile_alloc_init_block_size,
+ tile_allocation_size);
+ return -EINVAL;
+ }
+ if (*(uint32_t *)(untrusted + 8) != 0) {
+ DRM_ERROR("TSDA offset != 0 unsupported\n");
+ return -EINVAL;
+ }
+ if (exec->bin_tiles_x * exec->bin_tiles_y * 48 >
+ tile_state_data_array->base.size) {
+ DRM_ERROR("TSDA of %db too small for %dx%d bin config\n",
+ tile_state_data_array->base.size,
+ exec->bin_tiles_x, exec->bin_tiles_y);
+ }
+ *(uint32_t *)(validated + 8) = tile_state_data_array->paddr;
return 0;
}
@@ -181,34 +461,83 @@ static int
validate_tile_rendering_mode_config(VALIDATE_ARGS)
{
struct drm_gem_cma_object *fbo;
+ uint32_t flags, offset, cpp;
- fbo = exec->bo[exec->bo_index[0]];
+ if (exec->found_tile_rendering_mode_config_packet) {
+ DRM_ERROR("Duplicate VC4_PACKET_TILE_RENDERING_MODE_CONFIG\n");
+ return -EINVAL;
+ }
+ exec->found_tile_rendering_mode_config_packet = true;
- /* XXX: Validate offsets */
- *(uint32_t *)validated =
- *(uint32_t *)untrusted + fbo->paddr;
+ if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &fbo))
+ return -EINVAL;
+
+ exec->fb_width = *(uint16_t *)(untrusted + 4);
+ exec->fb_height = *(uint16_t *)(untrusted + 6);
+
+ /* Make sure that the fb width/height matches the binning config -- we
+ * rely on being able to interchange these for various assertions.
+ * (Within a tile, loads and stores will be clipped to the
+ * width/height, but we allow load/storing to any binned tile).
+ */
+ if (exec->fb_width <= (exec->bin_tiles_x - 1) * 64 ||
+ exec->fb_width > exec->bin_tiles_x * 64 ||
+ exec->fb_height <= (exec->bin_tiles_y - 1) * 64 ||
+ exec->fb_height > exec->bin_tiles_y * 64) {
+ DRM_ERROR("bin config %dx%d doesn't match FB %dx%d\n",
+ exec->bin_tiles_x, exec->bin_tiles_y,
+ exec->fb_width, exec->fb_height);
+ return -EINVAL;
+ }
+
+ flags = *(uint16_t *)(untrusted + 8);
+ if ((flags & VC4_RENDER_CONFIG_FORMAT_MASK) ==
+ VC4_RENDER_CONFIG_FORMAT_RGBA8888) {
+ cpp = 4;
+ } else {
+ cpp = 2;
+ }
+
+ offset = *(uint32_t *)untrusted;
+ if (!check_tex_size(exec, fbo, offset,
+ ((flags &
+ VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK) >>
+ VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT),
+ exec->fb_width, exec->fb_height, cpp)) {
+ return -EINVAL;
+ }
+
+ *(uint32_t *)validated = fbo->paddr + offset;
return 0;
}
static int
-validate_gem_handles(VALIDATE_ARGS)
+validate_tile_coordinates(VALIDATE_ARGS)
{
- int i;
-
- memcpy(exec->bo_index, untrusted, sizeof(exec->bo_index));
-
- for (i = 0; i < ARRAY_SIZE(exec->bo_index); i++) {
- if (exec->bo_index[i] >= exec->bo_count) {
- DRM_ERROR("Validated BO index %d >= %d\n",
- exec->bo_index[i], exec->bo_count);
- return -EINVAL;
- }
+ uint8_t tile_x = *(uint8_t *)(untrusted + 0);
+ uint8_t tile_y = *(uint8_t *)(untrusted + 1);
+
+ if (tile_x >= exec->bin_tiles_x ||
+ tile_y >= exec->bin_tiles_y) {
+ DRM_ERROR("Tile coordinates %d,%d > bin config %d,%d\n",
+ tile_x,
+ tile_y,
+ exec->bin_tiles_x,
+ exec->bin_tiles_y);
+ return -EINVAL;
}
return 0;
}
+static int
+validate_gem_handles(VALIDATE_ARGS)
+{
+ memcpy(exec->bo_index, untrusted, sizeof(exec->bo_index));
+ return 0;
+}
+
static const struct cmd_info {
bool bin;
bool render;
@@ -216,63 +545,59 @@ static const struct cmd_info {
const char *name;
int (*func)(struct exec_info *exec, void *validated, void *untrusted);
} cmd_info[] = {
- [0] = { 1, 1, 1, "halt", NULL },
- [1] = { 1, 1, 1, "nop", NULL },
- [4] = { 1, 1, 1, "flush", NULL },
- [5] = { 1, 0, 1, "flush all state", NULL },
- [6] = { 1, 0, 1, "start tile binning", NULL },
- [7] = { 1, 0, 1, "increment semaphore", NULL },
- [8] = { 1, 1, 1, "wait on semaphore", NULL },
- [17] = { 1, 1, 5, "branch to sublist", validate_branch_to_sublist },
- [24] = { 0, 1, 1, "store MS resolved tile color buffer", NULL },
- [25] = { 0, 1, 1, "store MS resolved tile color buffer and EOF", NULL },
-
- [28] = { 0, 1, 7, "Store Tile Buffer General",
- validate_loadstore_tile_buffer_general },
- [29] = { 0, 1, 7, "Load Tile Buffer General",
- validate_loadstore_tile_buffer_general },
-
- [32] = { 1, 1, 14, "Indexed Primitive List",
- validate_indexed_prim_list },
-
- /* XXX: bounds check verts? */
- [33] = { 1, 1, 10, "Vertex Array Primitives", NULL },
-
- [56] = { 1, 1, 2, "primitive list format", NULL }, /* XXX: bin valid? */
-
- [64] = { 1, 1, 5, "GL Shader State", validate_gl_shader_state },
- [65] = { 1, 1, 5, "NV Shader State", validate_nv_shader_state },
-
- [96] = { 1, 1, 4, "configuration bits", NULL },
- [97] = { 1, 1, 5, "flat shade flags", NULL },
- [98] = { 1, 1, 5, "point size", NULL },
- [99] = { 1, 1, 5, "line width", NULL },
- [100] = { 1, 1, 3, "RHT X boundary", NULL },
- [101] = { 1, 1, 5, "Depth Offset", NULL },
- [102] = { 1, 1, 9, "Clip Window", NULL },
- [103] = { 1, 1, 5, "Viewport Offset", NULL },
- [105] = { 1, 1, 9, "Clipper XY Scaling", NULL },
+ [VC4_PACKET_HALT] = { 1, 1, 1, "halt", NULL },
+ [VC4_PACKET_NOP] = { 1, 1, 1, "nop", NULL },
+ [VC4_PACKET_FLUSH] = { 1, 1, 1, "flush", NULL },
+ [VC4_PACKET_FLUSH_ALL] = { 1, 0, 1, "flush all state", NULL },
+ [VC4_PACKET_START_TILE_BINNING] = { 1, 0, 1, "start tile binning", validate_start_tile_binning },
+ [VC4_PACKET_INCREMENT_SEMAPHORE] = { 1, 0, 1, "increment semaphore", NULL },
+ [VC4_PACKET_WAIT_ON_SEMAPHORE] = { 1, 1, 1, "wait on semaphore", NULL },
+ /* BRANCH_TO_SUB_LIST is actually supported in the binner as well, but
+ * we only use it from the render CL in order to jump into the tile
+ * allocation BO.
+ */
+ [VC4_PACKET_BRANCH_TO_SUB_LIST] = { 0, 1, 5, "branch to sublist", validate_branch_to_sublist },
+ [VC4_PACKET_STORE_MS_TILE_BUFFER] = { 0, 1, 1, "store MS resolved tile color buffer", NULL },
+ [VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF] = { 0, 1, 1, "store MS resolved tile color buffer and EOF", NULL },
+
+ [VC4_PACKET_STORE_TILE_BUFFER_GENERAL] = { 0, 1, 7, "Store Tile Buffer General", validate_loadstore_tile_buffer_general },
+ [VC4_PACKET_LOAD_TILE_BUFFER_GENERAL] = { 0, 1, 7, "Load Tile Buffer General", validate_loadstore_tile_buffer_general },
+
+ [VC4_PACKET_GL_INDEXED_PRIMITIVE] = { 1, 1, 14, "Indexed Primitive List", validate_indexed_prim_list },
+
+ [VC4_PACKET_GL_ARRAY_PRIMITIVE] = { 1, 1, 10, "Vertex Array Primitives", validate_gl_array_primitive },
+
+ /* This is only used by clipped primitives (packets 48 and 49), which
+ * we don't support parsing yet.
+ */
+ [VC4_PACKET_PRIMITIVE_LIST_FORMAT] = { 1, 1, 2, "primitive list format", NULL },
+
+ [VC4_PACKET_GL_SHADER_STATE] = { 1, 1, 5, "GL Shader State", validate_gl_shader_state },
+ [VC4_PACKET_NV_SHADER_STATE] = { 1, 1, 5, "NV Shader State", validate_nv_shader_state },
+
+ [VC4_PACKET_CONFIGURATION_BITS] = { 1, 1, 4, "configuration bits", NULL },
+ [VC4_PACKET_FLAT_SHADE_FLAGS] = { 1, 1, 5, "flat shade flags", NULL },
+ [VC4_PACKET_POINT_SIZE] = { 1, 1, 5, "point size", NULL },
+ [VC4_PACKET_LINE_WIDTH] = { 1, 1, 5, "line width", NULL },
+ [VC4_PACKET_RHT_X_BOUNDARY] = { 1, 1, 3, "RHT X boundary", NULL },
+ [VC4_PACKET_DEPTH_OFFSET] = { 1, 1, 5, "Depth Offset", NULL },
+ [VC4_PACKET_CLIP_WINDOW] = { 1, 1, 9, "Clip Window", NULL },
+ [VC4_PACKET_VIEWPORT_OFFSET] = { 1, 1, 5, "Viewport Offset", NULL },
+ [VC4_PACKET_CLIPPER_XY_SCALING] = { 1, 1, 9, "Clipper XY Scaling", NULL },
/* Note: The docs say this was also 105, but it was 106 in the
* initial userland code drop.
*/
- [106] = { 1, 1, 9, "Clipper Z Scale and Offset", NULL },
+ [VC4_PACKET_CLIPPER_Z_SCALING] = { 1, 1, 9, "Clipper Z Scale and Offset", NULL },
- [112] = { 1, 0, 16, "tile binning configuration",
- validate_tile_binning_config },
+ [VC4_PACKET_TILE_BINNING_MODE_CONFIG] = { 1, 0, 16, "tile binning configuration", validate_tile_binning_config },
- /* XXX: Do we need to validate this one? It's got width/height in it.
- */
- [113] = { 0, 1, 11, "tile rendering mode configuration",
- validate_tile_rendering_mode_config},
+ [VC4_PACKET_TILE_RENDERING_MODE_CONFIG] = { 0, 1, 11, "tile rendering mode configuration", validate_tile_rendering_mode_config},
- [114] = { 0, 1, 14, "Clear Colors", NULL },
+ [VC4_PACKET_CLEAR_COLORS] = { 0, 1, 14, "Clear Colors", NULL },
- /* XXX: Do we need to validate here? It's got tile x/y number for
- * rendering
- */
- [115] = { 0, 1, 3, "Tile Coordinates", NULL },
+ [VC4_PACKET_TILE_COORDINATES] = { 0, 1, 3, "Tile Coordinates", validate_tile_coordinates },
- [254] = { 1, 1, 9, "GEM handles", validate_gem_handles },
+ [VC4_PACKET_GEM_HANDLES] = { 1, 1, 9, "GEM handles", validate_gem_handles },
};
int
@@ -326,7 +651,7 @@ vc4_validate_cl(struct drm_device *dev,
return -EINVAL;
}
- if (cmd != 254)
+ if (cmd != VC4_PACKET_GEM_HANDLES)
memcpy(dst_pkt, src_pkt, info->len);
if (info->func && info->func(exec,
@@ -340,126 +665,322 @@ vc4_validate_cl(struct drm_device *dev,
src_offset += info->len;
/* GEM handle loading doesn't produce HW packets. */
- if (cmd != 254)
+ if (cmd != VC4_PACKET_GEM_HANDLES)
dst_offset += info->len;
/* When the CL hits halt, it'll stop reading anything else. */
- if (cmd == 0)
+ if (cmd == VC4_PACKET_HALT)
break;
}
+ if (is_bin) {
+ exec->ct0ea = exec->ct0ca + dst_offset;
+
+ if (!exec->found_start_tile_binning_packet) {
+ DRM_ERROR("Bin CL missing VC4_PACKET_START_TILE_BINNING\n");
+ return -EINVAL;
+ }
+ } else {
+ if (!exec->found_tile_rendering_mode_config_packet) {
+ DRM_ERROR("Render CL missing VC4_PACKET_TILE_RENDERING_MODE_CONFIG\n");
+ return -EINVAL;
+ }
+ exec->ct1ea = exec->ct1ca + dst_offset;
+ }
+
return 0;
}
+static bool
+reloc_tex(struct exec_info *exec,
+ void *uniform_data_u,
+ struct vc4_texture_sample_info *sample,
+ uint32_t texture_handle_index)
+
+{
+ struct drm_gem_cma_object *tex;
+ uint32_t p0 = *(uint32_t *)(uniform_data_u + sample->p_offset[0]);
+ uint32_t p1 = *(uint32_t *)(uniform_data_u + sample->p_offset[1]);
+ uint32_t *validated_p0 = exec->uniforms_v + sample->p_offset[0];
+ uint32_t offset = p0 & ~0xfff;
+ uint32_t miplevels = (p0 & 15);
+ uint32_t width = (p1 >> 8) & 2047;
+ uint32_t height = (p1 >> 20) & 2047;
+ uint32_t type, cpp, tiling_format;
+ uint32_t i;
+
+ if (width == 0)
+ width = 2048;
+ if (height == 0)
+ height = 2048;
+
+ if (p0 & (1 << 9)) {
+ DRM_ERROR("Cube maps unsupported\n");
+ return false;
+ }
+
+ type = ((p0 >> 4) & 15) | ((p1 >> 31) << 4);
+
+ switch (type) {
+ case 0: /* RGBA8888 */
+ case 1: /* RGBX8888 */
+ case 16: /* RGBA32R */
+ cpp = 4;
+ break;
+ case 2: /* RGBA4444 */
+ case 3: /* RGBA5551 */
+ case 4: /* RGB565 */
+ case 7: /* LUMALPHA */
+ case 9: /* S16F */
+ case 11: /* S16 */
+ cpp = 2;
+ break;
+ case 5: /* LUMINANCE */
+ case 6: /* ALPHA */
+ case 10: /* S8 */
+ cpp = 1;
+ break;
+ case 8: /* ETC1 */
+ case 12: /* BW1 */
+ case 13: /* A4 */
+ case 14: /* A1 */
+ case 15: /* RGBA64 */
+ case 17: /* YUV422R */
+ default:
+ DRM_ERROR("Texture format %d unsupported\n", type);
+ return false;
+ }
+
+ if (type == 16) {
+ tiling_format = VC4_TILING_FORMAT_LINEAR;
+ } else {
+ DRM_ERROR("Tiling formats not yet supported\n");
+ return false;
+ }
+
+ if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex))
+ return false;
+
+ if (!check_tex_size(exec, tex, offset, tiling_format,
+ width, height, cpp)) {
+ return false;
+ }
+
+ /* The mipmap levels are stored before the base of the texture. Make
+ * sure there is actually space in the BO.
+ */
+ for (i = 1; i <= miplevels; i++) {
+ uint32_t level_width = roundup(max(width >> i, 1u), 16 / cpp);
+ uint32_t level_height = max(height >> i, 1u);
+ uint32_t level_size = level_width * level_height * cpp;
+
+ if (offset < level_size) {
+ DRM_ERROR("Level %d (%dx%d) size %db overflowed "
+ "buffer bounds (offset %d)\n",
+ i, level_width, level_height,
+ level_size, offset);
+ return false;
+ }
+ }
+
+ *validated_p0 = tex->paddr + p0;
+
+ return true;
+}
+
static int
validate_shader_rec(struct drm_device *dev,
struct exec_info *exec,
- void *validated,
- void *unvalidated,
- uint32_t len,
struct vc4_shader_state *state)
{
- uint32_t *src_handles = unvalidated;
- void *src_pkt;
- void *dst_pkt = validated;
- static const int gl_bo_offsets[] = {
- 4, 8, /* fs code, ubo */
- 16, 20, /* vs code, ubo */
- 28, 32, /* cs code, ubo */
+ uint32_t *src_handles;
+ void *pkt_u, *pkt_v;
+ enum shader_rec_reloc_type {
+ RELOC_CODE,
+ RELOC_VBO,
+ };
+ struct shader_rec_reloc {
+ enum shader_rec_reloc_type type;
+ uint32_t offset;
};
- static const int nv_bo_offsets[] = {
- 4, 8, /* fs code, ubo */
- 12, /* vbo */
+ static const struct shader_rec_reloc gl_relocs[] = {
+ { RELOC_CODE, 4 }, /* fs */
+ { RELOC_CODE, 16 }, /* vs */
+ { RELOC_CODE, 28 }, /* cs */
};
- struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_bo_offsets) + 8];
- const int *bo_offsets;
- uint32_t nr_attributes = 0, nr_bo, packet_size;
+ static const struct shader_rec_reloc nv_relocs[] = {
+ { RELOC_CODE, 4 }, /* fs */
+ { RELOC_VBO, 12 }
+ };
+ const struct shader_rec_reloc *relocs;
+ struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8];
+ uint32_t nr_attributes = 0, nr_fixed_relocs, nr_relocs, packet_size;
int i;
+ struct vc4_validated_shader_info *validated_shader = NULL;
if (state->packet == VC4_PACKET_NV_SHADER_STATE) {
- bo_offsets = nv_bo_offsets;
- nr_bo = ARRAY_SIZE(nv_bo_offsets);
+ relocs = nv_relocs;
+ nr_fixed_relocs = ARRAY_SIZE(nv_relocs);
packet_size = 16;
} else {
- bo_offsets = gl_bo_offsets;
- nr_bo = ARRAY_SIZE(gl_bo_offsets);
+ relocs = gl_relocs;
+ nr_fixed_relocs = ARRAY_SIZE(gl_relocs);
nr_attributes = state->addr & 0x7;
if (nr_attributes == 0)
nr_attributes = 8;
- packet_size = 36 + nr_attributes * 8;
+ packet_size = gl_shader_rec_size(state->addr);
+ }
+ nr_relocs = nr_fixed_relocs + nr_attributes;
+
+ if (nr_relocs * 4 > exec->shader_rec_size) {
+ DRM_ERROR("overflowed shader recs reading %d handles "
+ "from %d bytes left\n",
+ nr_relocs, exec->shader_rec_size);
+ return -EINVAL;
}
- if ((nr_bo + nr_attributes) * 4 + packet_size > len) {
- DRM_ERROR("overflowed shader packet read "
- "(handles %d, packet %d, len %d)\n",
- (nr_bo + nr_attributes) * 4, packet_size, len);
+ src_handles = exec->shader_rec_u;
+ exec->shader_rec_u += nr_relocs * 4;
+ exec->shader_rec_size -= nr_relocs * 4;
+
+ if (packet_size > exec->shader_rec_size) {
+ DRM_ERROR("overflowed shader recs copying %db packet "
+ "from %d bytes left\n",
+ packet_size, exec->shader_rec_size);
return -EINVAL;
}
+ pkt_u = exec->shader_rec_u;
+ pkt_v = exec->shader_rec_v;
+ memcpy(pkt_v, pkt_u, packet_size);
+ exec->shader_rec_u += packet_size;
+ /* Shader recs have to be aligned to 16 bytes (due to the attribute
+ * flags being in the low bytes), so round the next validated shader
+ * rec address up. This should be safe, since we've got so many
+ * relocations in a shader rec packet.
+ */
+ BUG_ON(roundup(packet_size, 16) - packet_size > nr_relocs * 4);
+ exec->shader_rec_v += roundup(packet_size, 16);
+ exec->shader_rec_size -= packet_size;
- src_pkt = unvalidated + 4 * (nr_bo + nr_attributes);
- memcpy(dst_pkt, src_pkt, packet_size);
+ for (i = 0; i < nr_relocs; i++) {
+ enum vc4_bo_mode mode;
- for (i = 0; i < nr_bo + nr_attributes; i++) {
- if (src_handles[i] >= exec->bo_count) {
- DRM_ERROR("shader rec bo index %d > %d\n",
- src_handles[i], exec->bo_count);
- return -EINVAL;
+ if (i < nr_fixed_relocs && relocs[i].type == RELOC_CODE)
+ mode = VC4_MODE_SHADER;
+ else
+ mode = VC4_MODE_RENDER;
+
+ if (!vc4_use_bo(exec, src_handles[i], mode, &bo[i])) {
+ return -EINVAL;
}
- bo[i] = exec->bo[src_handles[i]];
}
- for (i = 0; i < nr_bo; i++) {
- /* XXX: validation */
- uint32_t o = bo_offsets[i];
- *(uint32_t *)(dst_pkt + o) =
- bo[i]->paddr + *(uint32_t *)(src_pkt + o);
+ for (i = 0; i < nr_fixed_relocs; i++) {
+ uint32_t o = relocs[i].offset;
+ uint32_t src_offset = *(uint32_t *)(pkt_u + o);
+ uint32_t *texture_handles_u;
+ void *uniform_data_u;
+ uint32_t tex;
+
+ *(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset;
+
+ switch (relocs[i].type) {
+ case RELOC_CODE:
+ kfree(validated_shader);
+ validated_shader = vc4_validate_shader(bo[i],
+ src_offset);
+ if (!validated_shader)
+ goto fail;
+
+ if (validated_shader->uniforms_src_size >
+ exec->uniforms_size) {
+ DRM_ERROR("Uniforms src buffer overflow\n");
+ goto fail;
+ }
+
+ texture_handles_u = exec->uniforms_u;
+ uniform_data_u = (texture_handles_u +
+ validated_shader->num_texture_samples);
+
+ memcpy(exec->uniforms_v, uniform_data_u,
+ validated_shader->uniforms_size);
+
+ for (tex = 0;
+ tex < validated_shader->num_texture_samples;
+ tex++) {
+ if (!reloc_tex(exec,
+ uniform_data_u,
+ &validated_shader->texture_samples[tex],
+ texture_handles_u[tex])) {
+ goto fail;
+ }
+ }
+
+ *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p;
+
+ exec->uniforms_u += validated_shader->uniforms_src_size;
+ exec->uniforms_v += validated_shader->uniforms_size;
+ exec->uniforms_p += validated_shader->uniforms_size;
+
+ break;
+
+ case RELOC_VBO:
+ break;
+ }
}
for (i = 0; i < nr_attributes; i++) {
- /* XXX: validation */
+ struct drm_gem_cma_object *vbo = bo[nr_fixed_relocs + i];
uint32_t o = 36 + i * 8;
- *(uint32_t *)(dst_pkt + o) =
- bo[nr_bo + i]->paddr + *(uint32_t *)(src_pkt + o);
+ uint32_t offset = *(uint32_t *)(pkt_u + o + 0);
+ uint32_t attr_size = *(uint8_t *)(pkt_u + o + 4) + 1;
+ uint32_t stride = *(uint8_t *)(pkt_u + o + 5);
+ uint32_t max_index;
+
+ if (state->addr & 0x8)
+ stride |= (*(uint32_t *)(pkt_u + 100 + i * 4)) & ~0xff;
+
+ if (vbo->base.size < offset ||
+ vbo->base.size - offset < attr_size) {
+ DRM_ERROR("BO offset overflow (%d + %d > %d)\n",
+ offset, attr_size, vbo->base.size);
+ return -EINVAL;
+ }
+
+ if (stride != 0) {
+ max_index = ((vbo->base.size - offset - attr_size) /
+ stride);
+ if (state->max_index > max_index) {
+ DRM_ERROR("primitives use index %d out of supplied %d\n",
+ state->max_index, max_index);
+ return -EINVAL;
+ }
+ }
+
+ *(uint32_t *)(pkt_v + o) = vbo->paddr + offset;
}
+ kfree(validated_shader);
+
return 0;
+
+fail:
+ kfree(validated_shader);
+ return -EINVAL;
}
int
vc4_validate_shader_recs(struct drm_device *dev,
- void *validated,
- void *unvalidated,
- uint32_t len,
struct exec_info *exec)
{
- uint32_t dst_offset = 0;
- uint32_t src_offset = 0;
uint32_t i;
int ret = 0;
for (i = 0; i < exec->shader_state_count; i++) {
- if ((exec->shader_state[i].addr & ~0xf) !=
- (validated - exec->exec_bo->vaddr -
- (exec->shader_paddr - exec->exec_bo->paddr))) {
- DRM_ERROR("unexpected shader rec offset: "
- "0x%08x vs 0x%08x\n",
- exec->shader_state[i].addr & ~0xf,
- (int)(validated -
- exec->exec_bo->vaddr -
- (exec->shader_paddr -
- exec->exec_bo->paddr)));
- return -EINVAL;
- }
-
- ret = validate_shader_rec(dev, exec,
- validated + dst_offset,
- unvalidated + src_offset,
- len - src_offset,
- &exec->shader_state[i]);
+ ret = validate_shader_rec(dev, exec, &exec->shader_state[i]);
if (ret)
return ret;
- /* XXX: incr dst/src offset */
}
return ret;
diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
new file mode 100644
index 000000000000..c53807603a51
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -0,0 +1,318 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * DOC: Shader validator for VC4.
+ *
+ * The VC4 has no IOMMU between it and system memory. So, a user with access
+ * to execute shaders could escalate privilege by overwriting system memory
+ * (using the VPM write address register in the general-purpose DMA mode) or
+ * reading system memory it shouldn't (reading it as a texture, or uniform
+ * data, or vertex data).
+ *
+ * This walks over a shader starting from some offset within a BO, ensuring
+ * that its accesses are appropriately bounded, and recording how many texture
+ * accesses are made and where so that we can do relocations for them in the
+ * uniform stream.
+ *
+ * The kernel API has shaders stored in user-mapped BOs. The BOs will be
+ * forcibly unmapped from the process before validation, and any cache of
+ * validated state will be flushed if the mapping is faulted back in.
+ *
+ * Storing the shaders in BOs means that the validation process will be slow
+ * due to uncached reads, but since shaders are long-lived and shader BOs are
+ * never actually modified, this shouldn't be a problem.
+ */
+
+#include "vc4_drv.h"
+#include "vc4_qpu_defines.h"
+
+struct vc4_shader_validation_state {
+ struct vc4_texture_sample_info tmu_setup[2];
+ int tmu_write_count[2];
+};
+
+static bool
+is_tmu_write(uint32_t waddr)
+{
+ return (waddr >= QPU_W_TMU0_S &&
+ waddr <= QPU_W_TMU1_B);
+}
+
+static bool
+record_validated_texture_sample(struct vc4_validated_shader_info *validated_shader,
+ struct vc4_shader_validation_state *validation_state,
+ int tmu)
+{
+ uint32_t s = validated_shader->num_texture_samples;
+ int i;
+ struct vc4_texture_sample_info *temp_samples;
+
+ temp_samples = krealloc(validated_shader->texture_samples,
+ (s + 1) * sizeof(*temp_samples),
+ GFP_KERNEL);
+ if (!temp_samples)
+ return false;
+
+ memcpy(temp_samples[s].p_offset,
+ validation_state->tmu_setup[tmu].p_offset,
+ validation_state->tmu_write_count[tmu] * sizeof(uint32_t));
+ for (i = validation_state->tmu_write_count[tmu]; i < 4; i++)
+ temp_samples[s].p_offset[i] = ~0;
+
+ validated_shader->num_texture_samples = s + 1;
+ validated_shader->texture_samples = temp_samples;
+
+ return true;
+}
+
+static bool
+check_tmu_write(struct vc4_validated_shader_info *validated_shader,
+ struct vc4_shader_validation_state *validation_state,
+ uint32_t waddr)
+{
+ int tmu = waddr > QPU_W_TMU0_B;
+
+ if (!is_tmu_write(waddr))
+ return true;
+
+ if (validation_state->tmu_write_count[tmu] >= 4) {
+ DRM_ERROR("TMU%d got too many parameters before dispatch\n",
+ tmu);
+ return false;
+ }
+ validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
+ validated_shader->uniforms_size;
+ validation_state->tmu_write_count[tmu]++;
+ validated_shader->uniforms_size += 4;
+
+ if (waddr == QPU_W_TMU0_S || waddr == QPU_W_TMU1_S) {
+ if (!record_validated_texture_sample(validated_shader,
+ validation_state, tmu)) {
+ return false;
+ }
+
+ validation_state->tmu_write_count[tmu] = 0;
+ }
+
+ return true;
+}
+
+static bool
+check_register_write(struct vc4_validated_shader_info *validated_shader,
+ struct vc4_shader_validation_state *validation_state,
+ uint32_t waddr)
+{
+ switch (waddr) {
+ case QPU_W_UNIFORMS_ADDRESS:
+ /* XXX: We'll probably need to support this for reladdr, but
+ * it's definitely a security-related one.
+ */
+ DRM_ERROR("uniforms address load unsupported\n");
+ return false;
+
+ case QPU_W_TLB_COLOR_MS:
+ case QPU_W_TLB_COLOR_ALL:
+ case QPU_W_TLB_Z:
+ /* These only interact with the tile buffer, not main memory,
+ * so they're safe.
+ */
+ return true;
+
+ case QPU_W_TMU0_S:
+ case QPU_W_TMU0_T:
+ case QPU_W_TMU0_R:
+ case QPU_W_TMU0_B:
+ case QPU_W_TMU1_S:
+ case QPU_W_TMU1_T:
+ case QPU_W_TMU1_R:
+ case QPU_W_TMU1_B:
+ return check_tmu_write(validated_shader, validation_state,
+ waddr);
+
+ case QPU_W_HOST_INT:
+ case QPU_W_TMU_NOSWAP:
+ case QPU_W_TLB_STENCIL_SETUP:
+ case QPU_W_TLB_ALPHA_MASK:
+ case QPU_W_MUTEX_RELEASE:
+ /* XXX: I haven't thought about these, so don't support them
+ * for now.
+ */
+ DRM_ERROR("Unsupported waddr %d\n", waddr);
+ return false;
+
+ case QPU_W_VPM_ADDR:
+ DRM_ERROR("General VPM DMA unsupported\n");
+ return false;
+
+ case QPU_W_VPM:
+ case QPU_W_VPMVCD_SETUP:
+ /* We allow VPM setup in general, even including VPM DMA
+ * configuration setup, because the (unsafe) DMA can only be
+ * triggered by QPU_W_VPM_ADDR writes.
+ */
+ return true;
+ }
+
+ return true;
+}
+
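+/* An instruction can write through both the ADD and MUL pipelines, so
+ * validate each destination.  Having both pipelines set up a texture in
+ * a single instruction isn't something the TMU tracking above handles,
+ * so reject it.
+ */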
+static bool
+check_instruction_writes(uint64_t inst,
+ struct vc4_validated_shader_info *validated_shader,
+ struct vc4_shader_validation_state *validation_state)
+{
+ uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+ uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+
+ if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
+ DRM_ERROR("ADD and MUL both set up textures\n");
+ return false;
+ }
+
+ return (check_register_write(validated_shader, validation_state, waddr_add) &&
+ check_register_write(validated_shader, validation_state, waddr_mul));
+}
+
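+/* Each read of the uniform FIFO consumes 4 bytes of the uniform stream,
+ * so count QPU_R_UNIF reads to size the validated uniform copy.
+ */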
+static bool
+check_instruction_reads(uint64_t inst,
+ struct vc4_validated_shader_info *validated_shader)
+{
+ uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+ uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+ uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+ uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+
+ if (raddr_a == QPU_R_UNIF ||
+ raddr_b == QPU_R_UNIF) {
+ if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul)) {
+ DRM_ERROR("uniform read in the same instruction as "
+ "texture setup");
+ return false;
+ }
+
+ /* This can't overflow the uint32_t, because each 4-byte
+ * increment here corresponds to 8 bytes of instruction read
+ * from the BO, so the BO allocation would have failed long
+ * before uniforms_size could wrap.
+ */
+ validated_shader->uniforms_size += 4;
+ }
+
+ return true;
+}
+
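+/* Walks the QPU instructions of the shader starting at start_offset
+ * within shader_obj, rejecting anything that can't be proven safe, and
+ * returns a description of how much uniform data the shader consumes
+ * and where texture relocations need to land.  Returns NULL on failure.
+ */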
+struct vc4_validated_shader_info *
+vc4_validate_shader(struct drm_gem_cma_object *shader_obj,
+ uint32_t start_offset)
+{
+ bool found_shader_end = false;
+ int shader_end_ip = 0;
+ uint32_t ip, max_ip;
+ uint64_t *shader;
+ struct vc4_validated_shader_info *validated_shader;
+ struct vc4_shader_validation_state validation_state;
+
+ memset(&validation_state, 0, sizeof(validation_state));
+
+ if (start_offset + sizeof(uint64_t) > shader_obj->base.size) {
+ DRM_ERROR("shader starting at %d outside of BO sized %d\n",
+ start_offset,
+ shader_obj->base.size);
+ return NULL;
+ }
+ shader = shader_obj->vaddr + start_offset;
+ max_ip = (shader_obj->base.size - start_offset) / sizeof(uint64_t);
+
+ validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
+ if (!validated_shader)
+ return NULL;
+
+ for (ip = 0; ip < max_ip; ip++) {
+ uint64_t inst = shader[ip];
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+ switch (sig) {
+ case QPU_SIG_NONE:
+ case QPU_SIG_WAIT_FOR_SCOREBOARD:
+ case QPU_SIG_SCOREBOARD_UNLOCK:
+ case QPU_SIG_LOAD_TMU0:
+ case QPU_SIG_LOAD_TMU1:
+ if (!check_instruction_writes(inst, validated_shader,
+ &validation_state)) {
+ DRM_ERROR("Bad write at ip %d\n", ip);
+ goto fail;
+ }
+
+ if (!check_instruction_reads(inst, validated_shader))
+ goto fail;
+
+ break;
+
+ case QPU_SIG_LOAD_IMM:
+ if (!check_instruction_writes(inst, validated_shader,
+ &validation_state)) {
+ DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip);
+ goto fail;
+ }
+ break;
+
+ case QPU_SIG_PROG_END:
+ found_shader_end = true;
+ shader_end_ip = ip;
+ break;
+
+ default:
+ DRM_ERROR("Unsupported QPU signal %d at "
+ "instruction %d\n", sig, ip);
+ goto fail;
+ }
+
+ /* There are two delay slots after program end is signaled
+ * that are still executed, then we're finished.
+ */
+ if (found_shader_end && ip == shader_end_ip + 2)
+ break;
+ }
+
+ if (ip == max_ip) {
+ DRM_ERROR("shader starting at %d failed to terminate before "
+ "shader BO end at %d\n",
+ start_offset,
+ shader_obj->base.size);
+ goto fail;
+ }
+
+ /* Again, no chance of integer overflow here because the worst case
+ * scenario is 8 bytes of uniforms plus handles per 8-byte
+ * instruction.
+ */
+ validated_shader->uniforms_src_size =
+ (validated_shader->uniforms_size +
+ 4 * validated_shader->num_texture_samples);
+
+ return validated_shader;
+
+fail:
+ kfree(validated_shader->texture_samples);
+ kfree(validated_shader);
+ return NULL;
+}
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index 6cccad5f6c8b..fa9d2372b3e1 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -28,10 +28,11 @@
#define DRM_VC4_SUBMIT_CL 0x00
-#define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
+#define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
/**
- * Structure for submitting commands to the 3D engine.
+ * struct drm_vc4_submit_cl - ioctl argument for submitting commands to the 3D
+ * engine.
*
* Drivers typically use GPU BOs to store batchbuffers / command lists and
* their associated state. However, because the VC4 lacks an MMU, we have to
@@ -43,8 +44,7 @@
* BO.
*/
struct drm_vc4_submit_cl {
- /**
- * Pointer to the binner command list.
+ /* Pointer to the binner command list.
*
* This is the first set of commands executed, which runs the
* coordinate shader to determine where primitives land on the screen,
@@ -53,8 +53,7 @@ struct drm_vc4_submit_cl {
*/
void __user *bin_cl;
- /**
- * Pointer to the render command list.
+ /* Pointer to the render command list.
*
* The render command list contains a set of packets to load the
* current tile's state (reading from memory, or just clearing it)
@@ -64,7 +63,7 @@ struct drm_vc4_submit_cl {
*/
void __user *render_cl;
- /** Pointer to the shader records.
+ /* Pointer to the shader records.
*
* Shader records are the structures read by the hardware that contain
* pointers to uniforms, shaders, and vertex attributes. The
@@ -73,25 +72,42 @@ struct drm_vc4_submit_cl {
* and an attribute count), so those BO indices into bo_handles are
* just stored as uint32_ts before each shader record passed in.
*/
- void __user *shader_records;
+ void __user *shader_rec;
+
+ /* Pointer to uniform data and texture handles for the textures
+ * referenced by the shader.
+ *
+ * For each shader state record, there is a set of uniform data in the
+ * order referenced by the record (FS, VS, then CS). Each set of
+ * uniform data has a uint32_t index into bo_handles per texture
+ * sample operation, in the order the QPU_W_TMUn_S writes appear in
+ * the program. Following the texture BO handle indices is the actual
+ * uniform data.
+ *
+ * The individual uniform state blocks don't have sizes passed in,
+ * because the kernel has to determine the sizes anyway during shader
+ * code validation.
+ */
+ void __user *uniforms;
void __user *bo_handles;
- /** Size in bytes of the binner command list. */
- uint32_t bin_cl_len;
- /** Size in bytes of the render command list */
- uint32_t render_cl_len;
- /** Size in bytes of the list of shader records. */
- uint32_t shader_record_len;
- /**
- * Number of shader records.
+ /* Size in bytes of the binner command list. */
+ uint32_t bin_cl_size;
+ /* Size in bytes of the render command list. */
+ uint32_t render_cl_size;
+ /* Size in bytes of the set of shader records. */
+ uint32_t shader_rec_size;
+ /* Number of shader records.
*
- * This could just be computed from the contents of shader_records,
- * but it keeps the kernel from having to resize various allocations
- * it makes.
+ * This could just be computed from the contents of shader_rec and
+ * the address bits of references to them from the bin CL, but it
+ * keeps the kernel from having to resize some allocations it makes.
*/
- uint32_t shader_record_count;
+ uint32_t shader_rec_count;
+ /* Size in bytes of the uniform state. */
+ uint32_t uniforms_size;
- /** Number of BO handles passed in (size is that times 4). */
+ /* Number of BO handles passed in (size is that times 4). */
uint32_t bo_handle_count;
};