From 5ea1d4f213c2ae388684fa70c9b2f1a9eed46825 Mon Sep 17 00:00:00 2001 From: Robert Bragg Date: Fri, 9 Nov 2012 19:03:06 +0000 Subject: stash: Adds outline of drm driver based on SNA code --- cogl/Makefile.am | 44 + cogl/cogl-private.h | 5 + cogl/cogl-renderer.c | 27 + cogl/cogl-renderer.h | 13 +- cogl/driver/drm/brw/brw.h | 17 + cogl/driver/drm/brw/brw_disasm.c | 1104 +++++ cogl/driver/drm/brw/brw_eu.c | 150 + cogl/driver/drm/brw/brw_eu.h | 2266 +++++++++ cogl/driver/drm/brw/brw_eu_debug.c | 95 + cogl/driver/drm/brw/brw_eu_emit.c | 2002 ++++++++ cogl/driver/drm/brw/brw_eu_util.c | 126 + cogl/driver/drm/brw/brw_sf.c | 54 + cogl/driver/drm/brw/brw_test.c | 60 + cogl/driver/drm/brw/brw_test.h | 46 + cogl/driver/drm/brw/brw_test_gen4.c | 199 + cogl/driver/drm/brw/brw_test_gen5.c | 208 + cogl/driver/drm/brw/brw_test_gen6.c | 209 + cogl/driver/drm/brw/brw_test_gen7.c | 191 + cogl/driver/drm/brw/brw_wm.c | 681 +++ cogl/driver/drm/cogl-attribute-drm-private.h | 42 + cogl/driver/drm/cogl-attribute-drm.c | 43 + cogl/driver/drm/cogl-clip-stack-drm-private.h | 38 + cogl/driver/drm/cogl-clip-stack-drm.c | 37 + cogl/driver/drm/cogl-driver-drm.c | 82 + cogl/driver/drm/cogl-framebuffer-drm-private.h | 97 + cogl/driver/drm/cogl-framebuffer-drm.c | 121 + cogl/driver/drm/cogl-texture-2d-drm-private.h | 118 + cogl/driver/drm/cogl-texture-2d-drm.c | 167 + cogl/driver/drm/compiler.h | 59 + cogl/driver/drm/intel_list.h | 408 ++ cogl/driver/drm/kgem.c | 5182 ++++++++++++++++++++ cogl/driver/drm/kgem.h | 620 +++ cogl/driver/drm/kgem_debug.c | 424 ++ cogl/driver/drm/kgem_debug.h | 34 + cogl/driver/drm/kgem_debug_gen2.c | 687 +++ cogl/driver/drm/kgem_debug_gen3.c | 1600 ++++++ cogl/driver/drm/kgem_debug_gen4.c | 688 +++ cogl/driver/drm/kgem_debug_gen5.c | 664 +++ cogl/driver/drm/kgem_debug_gen6.c | 1075 ++++ cogl/driver/drm/kgem_debug_gen7.c | 716 +++ cogl/driver/drm/render_program/exa_sf.g4b | 15 + cogl/driver/drm/render_program/exa_sf.g5b | 7 + 
cogl/driver/drm/render_program/exa_sf_mask.g4b | 15 + cogl/driver/drm/render_program/exa_sf_mask.g5b | 7 + cogl/driver/drm/render_program/exa_wm_ca.g4b | 4 + cogl/driver/drm/render_program/exa_wm_ca.g5b | 4 + cogl/driver/drm/render_program/exa_wm_ca.g6b | 4 + .../drm/render_program/exa_wm_ca_srcalpha.g4b | 4 + .../drm/render_program/exa_wm_ca_srcalpha.g5b | 4 + .../drm/render_program/exa_wm_ca_srcalpha.g6b | 4 + .../drm/render_program/exa_wm_mask_affine.g4b | 8 + .../drm/render_program/exa_wm_mask_affine.g5b | 4 + .../drm/render_program/exa_wm_mask_affine.g6b | 4 + .../drm/render_program/exa_wm_mask_affine.g7b | 4 + .../drm/render_program/exa_wm_mask_projective.g4b | 16 + .../drm/render_program/exa_wm_mask_projective.g5b | 16 + .../drm/render_program/exa_wm_mask_projective.g6b | 12 + .../drm/render_program/exa_wm_mask_projective.g7b | 12 + .../drm/render_program/exa_wm_mask_sample_a.g4b | 3 + .../drm/render_program/exa_wm_mask_sample_a.g5b | 3 + .../drm/render_program/exa_wm_mask_sample_a.g6b | 3 + .../drm/render_program/exa_wm_mask_sample_a.g7b | 3 + .../drm/render_program/exa_wm_mask_sample_argb.g4b | 3 + .../drm/render_program/exa_wm_mask_sample_argb.g5b | 3 + .../drm/render_program/exa_wm_mask_sample_argb.g6b | 3 + .../drm/render_program/exa_wm_mask_sample_argb.g7b | 3 + cogl/driver/drm/render_program/exa_wm_noca.g4b | 4 + cogl/driver/drm/render_program/exa_wm_noca.g5b | 4 + cogl/driver/drm/render_program/exa_wm_noca.g6b | 4 + .../drm/render_program/exa_wm_src_affine.g4b | 8 + .../drm/render_program/exa_wm_src_affine.g5b | 4 + .../drm/render_program/exa_wm_src_affine.g6b | 4 + .../drm/render_program/exa_wm_src_affine.g7b | 4 + .../drm/render_program/exa_wm_src_projective.g4b | 16 + .../drm/render_program/exa_wm_src_projective.g5b | 16 + .../drm/render_program/exa_wm_src_projective.g6b | 12 + .../drm/render_program/exa_wm_src_projective.g7b | 12 + .../drm/render_program/exa_wm_src_sample_a.g4b | 3 + .../drm/render_program/exa_wm_src_sample_a.g5b | 3 + 
.../drm/render_program/exa_wm_src_sample_a.g6b | 3 + .../drm/render_program/exa_wm_src_sample_a.g7b | 3 + .../drm/render_program/exa_wm_src_sample_argb.g4b | 3 + .../drm/render_program/exa_wm_src_sample_argb.g5b | 2 + .../drm/render_program/exa_wm_src_sample_argb.g6b | 3 + .../drm/render_program/exa_wm_src_sample_argb.g7b | 3 + .../render_program/exa_wm_src_sample_planar.g4b | 5 + .../render_program/exa_wm_src_sample_planar.g5b | 5 + .../render_program/exa_wm_src_sample_planar.g6b | 5 + .../render_program/exa_wm_src_sample_planar.g7b | 5 + cogl/driver/drm/render_program/exa_wm_write.g4b | 18 + cogl/driver/drm/render_program/exa_wm_write.g5b | 6 + cogl/driver/drm/render_program/exa_wm_write.g6b | 17 + cogl/driver/drm/render_program/exa_wm_write.g7b | 17 + cogl/driver/drm/render_program/exa_wm_xy.g4b | 4 + cogl/driver/drm/render_program/exa_wm_xy.g5b | 4 + cogl/driver/drm/render_program/exa_wm_yuv_rgb.g4b | 12 + cogl/driver/drm/render_program/exa_wm_yuv_rgb.g5b | 12 + cogl/driver/drm/render_program/exa_wm_yuv_rgb.g6b | 12 + cogl/driver/drm/render_program/exa_wm_yuv_rgb.g7b | 12 + cogl/driver/drm/sna.h | 829 ++++ cogl/driver/drm/sna_reg.h | 82 + cogl/driver/drm/sna_render.h | 720 +++ cogl/winsys/cogl-winsys-drm-private.h | 30 + cogl/winsys/cogl-winsys-drm.c | 358 ++ configure.ac | 18 + examples/cogl-info.c | 4 +- 106 files changed, 22849 insertions(+), 4 deletions(-) create mode 100644 cogl/driver/drm/brw/brw.h create mode 100644 cogl/driver/drm/brw/brw_disasm.c create mode 100644 cogl/driver/drm/brw/brw_eu.c create mode 100644 cogl/driver/drm/brw/brw_eu.h create mode 100644 cogl/driver/drm/brw/brw_eu_debug.c create mode 100644 cogl/driver/drm/brw/brw_eu_emit.c create mode 100644 cogl/driver/drm/brw/brw_eu_util.c create mode 100644 cogl/driver/drm/brw/brw_sf.c create mode 100644 cogl/driver/drm/brw/brw_test.c create mode 100644 cogl/driver/drm/brw/brw_test.h create mode 100644 cogl/driver/drm/brw/brw_test_gen4.c create mode 100644 cogl/driver/drm/brw/brw_test_gen5.c 
create mode 100644 cogl/driver/drm/brw/brw_test_gen6.c create mode 100644 cogl/driver/drm/brw/brw_test_gen7.c create mode 100644 cogl/driver/drm/brw/brw_wm.c create mode 100644 cogl/driver/drm/cogl-attribute-drm-private.h create mode 100644 cogl/driver/drm/cogl-attribute-drm.c create mode 100644 cogl/driver/drm/cogl-clip-stack-drm-private.h create mode 100644 cogl/driver/drm/cogl-clip-stack-drm.c create mode 100644 cogl/driver/drm/cogl-driver-drm.c create mode 100644 cogl/driver/drm/cogl-framebuffer-drm-private.h create mode 100644 cogl/driver/drm/cogl-framebuffer-drm.c create mode 100644 cogl/driver/drm/cogl-texture-2d-drm-private.h create mode 100644 cogl/driver/drm/cogl-texture-2d-drm.c create mode 100644 cogl/driver/drm/compiler.h create mode 100644 cogl/driver/drm/intel_list.h create mode 100644 cogl/driver/drm/kgem.c create mode 100644 cogl/driver/drm/kgem.h create mode 100644 cogl/driver/drm/kgem_debug.c create mode 100644 cogl/driver/drm/kgem_debug.h create mode 100644 cogl/driver/drm/kgem_debug_gen2.c create mode 100644 cogl/driver/drm/kgem_debug_gen3.c create mode 100644 cogl/driver/drm/kgem_debug_gen4.c create mode 100644 cogl/driver/drm/kgem_debug_gen5.c create mode 100644 cogl/driver/drm/kgem_debug_gen6.c create mode 100644 cogl/driver/drm/kgem_debug_gen7.c create mode 100644 cogl/driver/drm/render_program/exa_sf.g4b create mode 100644 cogl/driver/drm/render_program/exa_sf.g5b create mode 100644 cogl/driver/drm/render_program/exa_sf_mask.g4b create mode 100644 cogl/driver/drm/render_program/exa_sf_mask.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_ca.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_ca.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_ca.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g6b create mode 100644 
cogl/driver/drm/render_program/exa_wm_mask_affine.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_affine.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_affine.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_affine.g7b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_projective.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_projective.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_projective.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_projective.g7b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_sample_a.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_sample_a.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_sample_a.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_sample_a.g7b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g7b create mode 100644 cogl/driver/drm/render_program/exa_wm_noca.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_noca.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_noca.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_affine.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_affine.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_affine.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_affine.g7b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_projective.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_projective.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_projective.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_projective.g7b create mode 
100644 cogl/driver/drm/render_program/exa_wm_src_sample_a.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_a.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_a.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_a.g7b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_argb.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_argb.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_argb.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_argb.g7b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_planar.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_planar.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_planar.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_src_sample_planar.g7b create mode 100644 cogl/driver/drm/render_program/exa_wm_write.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_write.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_write.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_write.g7b create mode 100644 cogl/driver/drm/render_program/exa_wm_xy.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_xy.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_yuv_rgb.g4b create mode 100644 cogl/driver/drm/render_program/exa_wm_yuv_rgb.g5b create mode 100644 cogl/driver/drm/render_program/exa_wm_yuv_rgb.g6b create mode 100644 cogl/driver/drm/render_program/exa_wm_yuv_rgb.g7b create mode 100644 cogl/driver/drm/sna.h create mode 100644 cogl/driver/drm/sna_reg.h create mode 100644 cogl/driver/drm/sna_render.h create mode 100644 cogl/winsys/cogl-winsys-drm-private.h create mode 100644 cogl/winsys/cogl-winsys-drm.c diff --git a/cogl/Makefile.am b/cogl/Makefile.am index 6b207ffe..dfcaada1 100644 --- a/cogl/Makefile.am +++ b/cogl/Makefile.am @@ -130,6 +130,45 @@ cogl_driver_sources = \ 
$(srcdir)/driver/nop/cogl-texture-2d-nop.c \ $(NULL) +if SUPPORT_DRM +# drm driver +INCLUDES += -I$(srcdir)/driver/drm/render_program +cogl_driver_sources += \ + $(srcdir)/driver/drm/cogl-driver-drm.c \ + $(srcdir)/driver/drm/cogl-framebuffer-drm-private.h \ + $(srcdir)/driver/drm/cogl-framebuffer-drm.c \ + $(srcdir)/driver/drm/cogl-attribute-drm-private.h \ + $(srcdir)/driver/drm/cogl-attribute-drm.c \ + $(srcdir)/driver/drm/cogl-clip-stack-drm-private.h \ + $(srcdir)/driver/drm/cogl-clip-stack-drm.c \ + $(srcdir)/driver/drm/cogl-texture-2d-drm-private.h \ + $(srcdir)/driver/drm/cogl-texture-2d-drm.c \ + $(srcdir)/driver/drm/brw/brw_test.h \ + $(srcdir)/driver/drm/brw/brw_test.c \ + $(srcdir)/driver/drm/brw/brw_test_gen4.c \ + $(srcdir)/driver/drm/brw/brw_test_gen5.c \ + $(srcdir)/driver/drm/brw/brw_test_gen6.c \ + $(srcdir)/driver/drm/brw/brw_test_gen7.c \ + $(srcdir)/driver/drm/brw/brw.h \ + $(srcdir)/driver/drm/brw/brw_disasm.c \ + $(srcdir)/driver/drm/brw/brw_eu.h \ + $(srcdir)/driver/drm/brw/brw_eu.c \ + $(srcdir)/driver/drm/brw/brw_eu_emit.c \ + $(srcdir)/driver/drm/brw/brw_sf.c \ + $(srcdir)/driver/drm/brw/brw_wm.c \ + $(srcdir)/driver/drm/kgem.c \ + $(srcdir)/driver/drm/kgem.h \ + $(srcdir)/driver/drm/kgem_debug.c \ + $(srcdir)/driver/drm/kgem_debug.h \ + $(srcdir)/driver/drm/kgem_debug_gen2.c \ + $(srcdir)/driver/drm/kgem_debug_gen3.c \ + $(srcdir)/driver/drm/kgem_debug_gen4.c \ + $(srcdir)/driver/drm/kgem_debug_gen5.c \ + $(srcdir)/driver/drm/kgem_debug_gen6.c \ + $(srcdir)/driver/drm/kgem_debug_gen7.c \ + $(NULL) +endif + # gl driver sources cogl_gl_prototypes_h = \ $(srcdir)/gl-prototypes/cogl-gles2-functions.h \ @@ -502,6 +541,11 @@ cogl_sources_c += \ $(srcdir)/winsys/cogl-winsys-sdl2.c \ $(srcdir)/cogl-sdl.c endif +if SUPPORT_DRM +cogl_sources_c += \ + $(srcdir)/winsys/cogl-winsys-drm-private.h \ + $(srcdir)/winsys/cogl-winsys-drm.c +endif EXTRA_DIST += stb_image.c diff --git a/cogl/cogl-private.h b/cogl/cogl-private.h index ca508a4e..81bc6b54 
100644 --- a/cogl/cogl-private.h +++ b/cogl/cogl-private.h @@ -137,6 +137,11 @@ _cogl_pixel_format_is_endian_dependant (CoglPixelFormat format); #define COGL_PIXEL_FORMAT_CAN_HAVE_PREMULT(format) \ (((format) & COGL_A_BIT) && (format) != COGL_PIXEL_FORMAT_A_8) +typedef struct +{ + short x0, y0, x1, y1; +} BoxRec; + COGL_END_DECLS #endif /* __COGL_PRIVATE_H__ */ diff --git a/cogl/cogl-renderer.c b/cogl/cogl-renderer.c index 04ffc76a..453e0865 100644 --- a/cogl/cogl-renderer.c +++ b/cogl/cogl-renderer.c @@ -72,6 +72,9 @@ #ifdef COGL_HAS_SDL_SUPPORT #include "cogl-winsys-sdl-private.h" #endif +#ifdef COGL_HAS_DRM_SUPPORT +#include "cogl-winsys-drm-private.h" +#endif #if COGL_HAS_XLIB_SUPPORT #include "cogl-xlib-renderer.h" @@ -87,6 +90,9 @@ extern const CoglDriverVtable _cogl_driver_gl; extern const CoglTextureDriver _cogl_texture_driver_gles; extern const CoglDriverVtable _cogl_driver_gles; #endif +#if defined (HAVE_COGL_DRM) +extern const CoglDriverVtable _cogl_driver_drm; +#endif extern const CoglDriverVtable _cogl_driver_nop; @@ -118,6 +124,9 @@ static CoglWinsysVtableGetter _cogl_winsys_vtable_getters[] = #endif #ifdef COGL_HAS_SDL_SUPPORT _cogl_winsys_sdl_get_vtable, +#endif +#ifdef COGL_HAS_DRM_SUPPORT + _cogl_winsys_drm_get_vtable, #endif _cogl_winsys_stub_get_vtable, }; @@ -319,6 +328,17 @@ _cogl_renderer_choose_driver (CoglRenderer *renderer, } #endif +#ifdef HAVE_COGL_DRM + if (renderer->driver_override == COGL_DRIVER_DRM || + (renderer->driver_override == COGL_DRIVER_ANY && + (driver_name == NULL || !g_ascii_strcasecmp (driver_name, "drm")))) + { + renderer->driver = COGL_DRIVER_DRM; + libgl_name = NULL; + goto found; + } +#endif + if (renderer->driver_override == COGL_DRIVER_NOP || (renderer->driver_override == COGL_DRIVER_ANY && (driver_name == NULL || !g_ascii_strcasecmp (driver_name, "nop")))) @@ -386,6 +406,13 @@ found: break; #endif +#if defined (HAVE_COGL_DRM) + case COGL_DRIVER_DRM: + renderer->driver_vtable = &_cogl_driver_drm; + 
renderer->texture_driver = NULL; + break; +#endif + case COGL_DRIVER_NOP: default: renderer->driver_vtable = &_cogl_driver_nop; diff --git a/cogl/cogl-renderer.h b/cogl/cogl-renderer.h index 45965195..c037f1f6 100644 --- a/cogl/cogl-renderer.h +++ b/cogl/cogl-renderer.h @@ -149,6 +149,7 @@ cogl_renderer_new (void); * @COGL_WINSYS_ID_EGL_ANDROID: Use EGL with the Android platform * @COGL_WINSYS_ID_WGL: Use the Microsoft Windows WGL binding API * @COGL_WINSYS_ID_SDL: Use the SDL window system + * @COGL_WINSYS_ID_DRM: Use the Linux DRM interfaces directly * * Identifies specific window system backends that Cogl supports. * @@ -167,7 +168,8 @@ typedef enum COGL_WINSYS_ID_EGL_KMS, COGL_WINSYS_ID_EGL_ANDROID, COGL_WINSYS_ID_WGL, - COGL_WINSYS_ID_SDL + COGL_WINSYS_ID_SDL, + COGL_WINSYS_ID_DRM } CoglWinsysID; /** @@ -264,6 +266,8 @@ cogl_renderer_connect (CoglRenderer *renderer, CoglError **error); * renderer supports creating a #CoglGLES2Context via * cogl_gles2_context_new(). This can be used to integrate GLES 2.0 * code into Cogl based applications. + * @COGL_RENDERER_CONSTRAINT_USES_GL: Required renderer depends + * on OpenGL[ES] * * These constraint flags are hard-coded features of the different renderer * backends. Sometimes a platform may support multiple rendering options which @@ -285,7 +289,8 @@ typedef enum COGL_RENDERER_CONSTRAINT_USES_X11 = (1 << 0), COGL_RENDERER_CONSTRAINT_USES_XLIB = (1 << 1), COGL_RENDERER_CONSTRAINT_USES_EGL = (1 << 2), - COGL_RENDERER_CONSTRAINT_SUPPORTS_COGL_GLES2 = (1 << 3) + COGL_RENDERER_CONSTRAINT_SUPPORTS_COGL_GLES2 = (1 << 3), + COGL_RENDERER_CONSTRAINT_USES_GL = (1 << 4) } CoglRendererConstraint; @@ -331,6 +336,7 @@ cogl_renderer_remove_constraint (CoglRenderer *renderer, * @COGL_DRIVER_GL3: An OpenGL driver using the core GL 3.1 profile * @COGL_DRIVER_GLES1: An OpenGL ES 1.1 driver. * @COGL_DRIVER_GLES2: An OpenGL ES 2.0 driver. + * @COGL_DRIVER_DRM: A DRM driver.
* * Identifiers for underlying hardware drivers that may be used by * Cogl for rendering. @@ -345,7 +351,8 @@ typedef enum COGL_DRIVER_GL, COGL_DRIVER_GL3, COGL_DRIVER_GLES1, - COGL_DRIVER_GLES2 + COGL_DRIVER_GLES2, + COGL_DRIVER_DRM } CoglDriver; /** diff --git a/cogl/driver/drm/brw/brw.h b/cogl/driver/drm/brw/brw.h new file mode 100644 index 00000000..e5fa72f9 --- /dev/null +++ b/cogl/driver/drm/brw/brw.h @@ -0,0 +1,17 @@ +#include "brw_eu.h" + +bool brw_sf_kernel__nomask(struct brw_compile *p); +bool brw_sf_kernel__mask(struct brw_compile *p); + +bool brw_wm_kernel__affine(struct brw_compile *p, int dispatch_width); +bool brw_wm_kernel__affine_mask(struct brw_compile *p, int dispatch_width); +bool brw_wm_kernel__affine_mask_ca(struct brw_compile *p, int dispatch_width); +bool brw_wm_kernel__affine_mask_sa(struct brw_compile *p, int dispatch_width); + +bool brw_wm_kernel__projective(struct brw_compile *p, int dispatch_width); +bool brw_wm_kernel__projective_mask(struct brw_compile *p, int dispatch_width); +bool brw_wm_kernel__projective_mask_ca(struct brw_compile *p, int dispatch_width); +bool brw_wm_kernel__projective_mask_sa(struct brw_compile *p, int dispatch_width); + +bool brw_wm_kernel__affine_opacity(struct brw_compile *p, int dispatch_width); +bool brw_wm_kernel__projective_opacity(struct brw_compile *p, int dispatch_width); diff --git a/cogl/driver/drm/brw/brw_disasm.c b/cogl/driver/drm/brw/brw_disasm.c new file mode 100644 index 00000000..e6da1745 --- /dev/null +++ b/cogl/driver/drm/brw/brw_disasm.c @@ -0,0 +1,1104 @@ +/* + * Copyright © 2008 Keith Packard + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * 
publicity pertaining to distribution of the software without specific, + * written prior permission. The copyright holders make no representations + * about the suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <assert.h> +#include <stdint.h> + +#include "brw_eu.h" + +static const struct { + const char *name; + int nsrc; + int ndst; +} opcode[128] = { + [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 }, + + [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DPH] = { .name = "dph", .nsrc
= 2, .ndst = 1 }, + [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 }, + 
[BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 }, +}; + +static const char *conditional_modifier[16] = { + [BRW_CONDITIONAL_NONE] = "", + [BRW_CONDITIONAL_Z] = ".e", + [BRW_CONDITIONAL_NZ] = ".ne", + [BRW_CONDITIONAL_G] = ".g", + [BRW_CONDITIONAL_GE] = ".ge", + [BRW_CONDITIONAL_L] = ".l", + [BRW_CONDITIONAL_LE] = ".le", + [BRW_CONDITIONAL_R] = ".r", + [BRW_CONDITIONAL_O] = ".o", + [BRW_CONDITIONAL_U] = ".u", +}; + +static const char *negate[2] = { + [0] = "", + [1] = "-", +}; + +static const char *_abs[2] = { + [0] = "", + [1] = "(abs)", +}; + +static const char *vert_stride[16] = { + [0] = "0", + [1] = "1", + [2] = "2", + [3] = "4", + [4] = "8", + [5] = "16", + [6] = "32", + [15] = "VxH", +}; + +static const char *width[8] = { + [0] = "1", + [1] = "2", + [2] = "4", + [3] = "8", + [4] = "16", +}; + +static const char *horiz_stride[4] = { + [0] = "0", + [1] = "1", + [2] = "2", + [3] = "4" +}; + +static const char *chan_sel[4] = { + [0] = "x", + [1] = "y", + [2] = "z", + [3] = "w", +}; + +#if 0 +static const char *dest_condmod[16] = { +}; + +static const char *imm_encoding[8] = { + [0] = "UD", + [1] = "D", + [2] = "UW", + [3] = "W", + [5] = "VF", + [6] = "V", + [7] = "F" +}; +#endif + +static const char *debug_ctrl[2] = { + [0] = "", + [1] = ".breakpoint" +}; + +static const char *saturate[2] = { + [0] = "", + [1] = ".sat" +}; + +static const char *accwr[2] = { + [0] = "", + [1] = "AccWrEnable" +}; + +static const char *wectrl[2] = { + [0] = "WE_normal", + [1] = "WE_all" +}; + +static const char *exec_size[8] = { + [0] = "1", + [1] = "2", + [2] = "4", + [3] = "8", + [4] = "16", + [5] = "32" +}; + +static const char *pred_inv[2] = { + [0] = "+", + [1] = "-" +}; + +static const char *pred_ctrl_align16[16] = { + [1] = "", + [2] = ".x", + [3] = ".y", + [4] = ".z", + [5] = ".w", + [6] = ".any4h", + [7] = ".all4h", +}; + +static const char *pred_ctrl_align1[16] = { + [1] = "", + [2] = ".anyv", + [3] = ".allv", + [4] = ".any2h", + [5] = ".all2h", + [6] = 
".any4h", + [7] = ".all4h", + [8] = ".any8h", + [9] = ".all8h", + [10] = ".any16h", + [11] = ".all16h", +}; + +static const char *thread_ctrl[4] = { + [0] = "", + [2] = "switch" +}; + +static const char *compr_ctrl[4] = { + [0] = "", + [1] = "sechalf", + [2] = "compr", + [3] = "compr4", +}; + +static const char *dep_ctrl[4] = { + [0] = "", + [1] = "NoDDClr", + [2] = "NoDDChk", + [3] = "NoDDClr,NoDDChk", +}; + +static const char *mask_ctrl[4] = { + [0] = "", + [1] = "nomask", +}; + +static const char *access_mode[2] = { + [0] = "align1", + [1] = "align16", +}; + +static const char *reg_encoding[8] = { + [0] = "UD", + [1] = "D", + [2] = "UW", + [3] = "W", + [4] = "UB", + [5] = "B", + [7] = "F" +}; + +static const int reg_type_size[8] = { + [0] = 4, + [1] = 4, + [2] = 2, + [3] = 2, + [4] = 1, + [5] = 1, + [7] = 4 +}; + +static const char *reg_file[4] = { + [0] = "A", + [1] = "g", + [2] = "m", + [3] = "imm", +}; + +static const char *writemask[16] = { + [0x0] = ".", + [0x1] = ".x", + [0x2] = ".y", + [0x3] = ".xy", + [0x4] = ".z", + [0x5] = ".xz", + [0x6] = ".yz", + [0x7] = ".xyz", + [0x8] = ".w", + [0x9] = ".xw", + [0xa] = ".yw", + [0xb] = ".xyw", + [0xc] = ".zw", + [0xd] = ".xzw", + [0xe] = ".yzw", + [0xf] = "", +}; + +static const char *end_of_thread[2] = { + [0] = "", + [1] = "EOT" +}; + +static const char *target_function[16] = { + [BRW_SFID_NULL] = "null", + [BRW_SFID_MATH] = "math", + [BRW_SFID_SAMPLER] = "sampler", + [BRW_SFID_MESSAGE_GATEWAY] = "gateway", + [BRW_SFID_DATAPORT_READ] = "read", + [BRW_SFID_DATAPORT_WRITE] = "write", + [BRW_SFID_URB] = "urb", + [BRW_SFID_THREAD_SPAWNER] = "thread_spawner" +}; + +static const char *target_function_gen6[16] = { + [BRW_SFID_NULL] = "null", + [BRW_SFID_MATH] = "math", + [BRW_SFID_SAMPLER] = "sampler", + [BRW_SFID_MESSAGE_GATEWAY] = "gateway", + [BRW_SFID_URB] = "urb", + [BRW_SFID_THREAD_SPAWNER] = "thread_spawner", + [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler", + [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render", + 
[GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const", + [GEN7_SFID_DATAPORT_DATA_CACHE] = "data" +}; + +static const char *dp_rc_msg_type_gen6[16] = { + [BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ] = "OWORD block read", + [GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ] = "RT UNORM read", + [GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ] = "OWORD dual block read", + [GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ] = "media block read", + [GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ] = "OWORD unaligned block read", + [GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ] = "DWORD scattered read", + [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE] = "DWORD atomic write", + [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE] = "OWORD block write", + [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE] = "OWORD dual block write", + [GEN6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE] = "media block write", + [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE] = "DWORD scattered write", + [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE] = "RT write", + [GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE] = "streamed VB write", + [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE] = "RT UNORMc write", +}; + +static const char *math_function[16] = { + [BRW_MATH_FUNCTION_INV] = "inv", + [BRW_MATH_FUNCTION_LOG] = "log", + [BRW_MATH_FUNCTION_EXP] = "exp", + [BRW_MATH_FUNCTION_SQRT] = "sqrt", + [BRW_MATH_FUNCTION_RSQ] = "rsq", + [BRW_MATH_FUNCTION_SIN] = "sin", + [BRW_MATH_FUNCTION_COS] = "cos", + [BRW_MATH_FUNCTION_SINCOS] = "sincos", + [BRW_MATH_FUNCTION_TAN] = "tan", + [BRW_MATH_FUNCTION_POW] = "pow", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod", + [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv", + [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod", +}; + +static const char *math_saturate[2] = { + [0] = "", + [1] = "sat" +}; + +static const char *math_signed[2] = { + [0] = "", + [1] = "signed" +}; + +static const char *math_scalar[2] = { + [0] = "", + [1] = "scalar" 
+}; + +static const char *math_precision[2] = { + [0] = "", + [1] = "partial_precision" +}; + +static const char *urb_opcode[2] = { + [0] = "urb_write", + [1] = "ff_sync", +}; + +static const char *urb_swizzle[4] = { + [BRW_URB_SWIZZLE_NONE] = "", + [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave", + [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose", +}; + +static const char *urb_allocate[2] = { + [0] = "", + [1] = "allocate" +}; + +static const char *urb_used[2] = { + [0] = "", + [1] = "used" +}; + +static const char *urb_complete[2] = { + [0] = "", + [1] = "complete" +}; + +static const char *sampler_target_format[4] = { + [0] = "F", + [2] = "UD", + [3] = "D" +}; + +static int column; + +static int string(FILE *file, const char *str) +{ + fputs(str, file); + column += strlen(str); + return 0; +} + +#if defined(__GNUC__) && (__GNUC__ > 2) +__attribute__((format(printf, 2, 3))) +#endif +static int format(FILE *f, const char *fmt, ...) +{ + char buf[1024]; + va_list args; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf) - 1, fmt, args); + va_end(args); + + string(f, buf); + return 0; +} + +static void newline(FILE *f) +{ + putc('\n', f); + column = 0; +} + +static void pad(FILE *f, int c) +{ + do + string(f, " "); + while (column < c); +} + +static void control(FILE *file, const char *name, const char *ctrl[], unsigned id, int *space) +{ + if (!ctrl[id]) { + fprintf(file, "*** invalid %s value %d ", + name, id); + assert(0); + } + if (ctrl[id][0]) { + if (space && *space) + string(file, " "); + string(file, ctrl[id]); + if (space) + *space = 1; + } +} + +static void print_opcode(FILE *file, int id) +{ + if (!opcode[id].name) { + format(file, "*** invalid opcode value %d ", id); + assert(0); + } + string(file, opcode[id].name); +} + +static int reg(FILE *file, unsigned _reg_file, unsigned _reg_nr) +{ + /* Clear the Compr4 instruction compression bit. 
*/ + if (_reg_file == BRW_MESSAGE_REGISTER_FILE) + _reg_nr &= ~(1 << 7); + + if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) { + switch (_reg_nr & 0xf0) { + case BRW_ARF_NULL: + string(file, "null"); + return -1; + case BRW_ARF_ADDRESS: + format(file, "a%d", _reg_nr & 0x0f); + break; + case BRW_ARF_ACCUMULATOR: + format(file, "acc%d", _reg_nr & 0x0f); + break; + case BRW_ARF_FLAG: + format(file, "f%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK: + format(file, "mask%d", _reg_nr & 0x0f); + break; + case BRW_ARF_MASK_STACK: + format(file, "msd%d", _reg_nr & 0x0f); + break; + case BRW_ARF_STATE: + format(file, "sr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_CONTROL: + format(file, "cr%d", _reg_nr & 0x0f); + break; + case BRW_ARF_NOTIFICATION_COUNT: + format(file, "n%d", _reg_nr & 0x0f); + break; + case BRW_ARF_IP: + string(file, "ip"); + return -1; + default: + format(file, "ARF%d", _reg_nr); + break; + } + } else { + control(file, "src reg file", reg_file, _reg_file, NULL); + format(file, "%d", _reg_nr); + } + return 0; +} + +static void dest(FILE *file, const struct brw_instruction *inst) +{ + if (inst->header.access_mode == BRW_ALIGN_1) { + if (inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT) { + if (reg(file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr)) + return; + + if (inst->bits1.da1.dest_subreg_nr) + format(file, ".%d", inst->bits1.da1.dest_subreg_nr / + reg_type_size[inst->bits1.da1.dest_reg_type]); + format(file, "<%d>", inst->bits1.da1.dest_horiz_stride); + control(file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL); + } else { + string(file, "g[a0"); + if (inst->bits1.ia1.dest_subreg_nr) + format(file, ".%d", inst->bits1.ia1.dest_subreg_nr / + reg_type_size[inst->bits1.ia1.dest_reg_type]); + if (inst->bits1.ia1.dest_indirect_offset) + format(file, " %d", inst->bits1.ia1.dest_indirect_offset); + string(file, "]"); + format(file, "<%d>", inst->bits1.ia1.dest_horiz_stride); + control(file, "dest reg 
encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL); + } + } else { + if (inst->bits1.da16.dest_address_mode == BRW_ADDRESS_DIRECT) { + if (reg(file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr)) + return; + + if (inst->bits1.da16.dest_subreg_nr) + format(file, ".%d", inst->bits1.da16.dest_subreg_nr / + reg_type_size[inst->bits1.da16.dest_reg_type]); + string(file, "<1>"); + control(file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL); + control(file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL); + } else { + string(file, "Indirect align16 address mode not supported"); + } + } +} + +static void src_align1_region(FILE *file, + unsigned _vert_stride, unsigned _width, unsigned _horiz_stride) +{ + string(file, "<"); + control(file, "vert stride", vert_stride, _vert_stride, NULL); + string(file, ","); + control(file, "width", width, _width, NULL); + string(file, ","); + control(file, "horiz_stride", horiz_stride, _horiz_stride, NULL); + string(file, ">"); +} + +static void src_da1(FILE *file, unsigned type, unsigned _reg_file, + unsigned _vert_stride, unsigned _width, unsigned _horiz_stride, + unsigned reg_num, unsigned sub_reg_num, unsigned __abs, unsigned _negate) +{ + control(file, "negate", negate, _negate, NULL); + control(file, "abs", _abs, __abs, NULL); + + if (reg(file, _reg_file, reg_num)) + return; + + if (sub_reg_num) + format(file, ".%d", sub_reg_num / reg_type_size[type]); /* use formal style like spec */ + src_align1_region(file, _vert_stride, _width, _horiz_stride); + control(file, "src reg encoding", reg_encoding, type, NULL); +} + +static void src_ia1(FILE *file, + unsigned type, + unsigned _reg_file, + int _addr_imm, + unsigned _addr_subreg_nr, + unsigned _negate, + unsigned __abs, + unsigned _addr_mode, + unsigned _horiz_stride, + unsigned _width, + unsigned _vert_stride) +{ + control(file, "negate", negate, _negate, NULL); + control(file, "abs", _abs, __abs, NULL); + + 
string(file, "g[a0"); + if (_addr_subreg_nr) + format(file, ".%d", _addr_subreg_nr); + if (_addr_imm) + format(file, " %d", _addr_imm); + string(file, "]"); + src_align1_region(file, _vert_stride, _width, _horiz_stride); + control(file, "src reg encoding", reg_encoding, type, NULL); +} + +static void src_da16(FILE *file, + unsigned _reg_type, + unsigned _reg_file, + unsigned _vert_stride, + unsigned _reg_nr, + unsigned _subreg_nr, + unsigned __abs, + unsigned _negate, + unsigned swz_x, + unsigned swz_y, + unsigned swz_z, + unsigned swz_w) +{ + control(file, "negate", negate, _negate, NULL); + control(file, "abs", _abs, __abs, NULL); + + if (reg(file, _reg_file, _reg_nr)) + return; + + if (_subreg_nr) + /* bit4 for subreg number byte addressing. Make this same meaning as + in da1 case, so output looks consistent. */ + format(file, ".%d", 16 / reg_type_size[_reg_type]); + string(file, "<"); + control(file, "vert stride", vert_stride, _vert_stride, NULL); + string(file, ",4,1>"); + /* + * Three kinds of swizzle display: + * identity - nothing printed + * 1->all - print the single channel + * 1->1 - print the mapping + */ + if (swz_x == BRW_CHANNEL_X && + swz_y == BRW_CHANNEL_Y && + swz_z == BRW_CHANNEL_Z && + swz_w == BRW_CHANNEL_W) + { + ; + } + else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w) + { + string(file, "."); + control(file, "channel select", chan_sel, swz_x, NULL); + } + else + { + string(file, "."); + control(file, "channel select", chan_sel, swz_x, NULL); + control(file, "channel select", chan_sel, swz_y, NULL); + control(file, "channel select", chan_sel, swz_z, NULL); + control(file, "channel select", chan_sel, swz_w, NULL); + } + control(file, "src da16 reg type", reg_encoding, _reg_type, NULL); +} + +static void imm(FILE *file, unsigned type, const struct brw_instruction *inst) +{ + switch (type) { + case BRW_REGISTER_TYPE_UD: + format(file, "0x%08xUD", inst->bits3.ud); + break; + case BRW_REGISTER_TYPE_D: + format(file, "%dD", 
inst->bits3.d); + break; + case BRW_REGISTER_TYPE_UW: + format(file, "0x%04xUW", (uint16_t) inst->bits3.ud); + break; + case BRW_REGISTER_TYPE_W: + format(file, "%dW", (int16_t) inst->bits3.d); + break; + case BRW_REGISTER_TYPE_UB: + format(file, "0x%02xUB", (int8_t) inst->bits3.ud); + break; + case BRW_REGISTER_TYPE_VF: + format(file, "Vector Float"); + break; + case BRW_REGISTER_TYPE_V: + format(file, "0x%08xV", inst->bits3.ud); + break; + case BRW_REGISTER_TYPE_F: + format(file, "%-gF", inst->bits3.f); + } +} + +static void src0(FILE *file, const struct brw_instruction *inst) +{ + if (inst->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE) + imm(file, inst->bits1.da1.src0_reg_type, inst); + else if (inst->header.access_mode == BRW_ALIGN_1) { + if (inst->bits2.da1.src0_address_mode == BRW_ADDRESS_DIRECT) { + src_da1(file, + inst->bits1.da1.src0_reg_type, + inst->bits1.da1.src0_reg_file, + inst->bits2.da1.src0_vert_stride, + inst->bits2.da1.src0_width, + inst->bits2.da1.src0_horiz_stride, + inst->bits2.da1.src0_reg_nr, + inst->bits2.da1.src0_subreg_nr, + inst->bits2.da1.src0_abs, + inst->bits2.da1.src0_negate); + } else { + src_ia1(file, + inst->bits1.ia1.src0_reg_type, + inst->bits1.ia1.src0_reg_file, + inst->bits2.ia1.src0_indirect_offset, + inst->bits2.ia1.src0_subreg_nr, + inst->bits2.ia1.src0_negate, + inst->bits2.ia1.src0_abs, + inst->bits2.ia1.src0_address_mode, + inst->bits2.ia1.src0_horiz_stride, + inst->bits2.ia1.src0_width, + inst->bits2.ia1.src0_vert_stride); + } + } else { + if (inst->bits2.da16.src0_address_mode == BRW_ADDRESS_DIRECT) { + src_da16(file, + inst->bits1.da16.src0_reg_type, + inst->bits1.da16.src0_reg_file, + inst->bits2.da16.src0_vert_stride, + inst->bits2.da16.src0_reg_nr, + inst->bits2.da16.src0_subreg_nr, + inst->bits2.da16.src0_abs, + inst->bits2.da16.src0_negate, + inst->bits2.da16.src0_swz_x, + inst->bits2.da16.src0_swz_y, + inst->bits2.da16.src0_swz_z, + inst->bits2.da16.src0_swz_w); + } else { + string(file, "Indirect align16 
address mode not supported"); + } + } +} + +static void src1(FILE *file, const struct brw_instruction *inst) +{ + if (inst->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE) + imm(file, inst->bits1.da1.src1_reg_type, inst); + else if (inst->header.access_mode == BRW_ALIGN_1) { + if (inst->bits3.da1.src1_address_mode == BRW_ADDRESS_DIRECT) { + src_da1(file, + inst->bits1.da1.src1_reg_type, + inst->bits1.da1.src1_reg_file, + inst->bits3.da1.src1_vert_stride, + inst->bits3.da1.src1_width, + inst->bits3.da1.src1_horiz_stride, + inst->bits3.da1.src1_reg_nr, + inst->bits3.da1.src1_subreg_nr, + inst->bits3.da1.src1_abs, + inst->bits3.da1.src1_negate); + } else { + src_ia1(file, + inst->bits1.ia1.src1_reg_type, + inst->bits1.ia1.src1_reg_file, + inst->bits3.ia1.src1_indirect_offset, + inst->bits3.ia1.src1_subreg_nr, + inst->bits3.ia1.src1_negate, + inst->bits3.ia1.src1_abs, + inst->bits3.ia1.src1_address_mode, + inst->bits3.ia1.src1_horiz_stride, + inst->bits3.ia1.src1_width, + inst->bits3.ia1.src1_vert_stride); + } + } else { + if (inst->bits3.da16.src1_address_mode == BRW_ADDRESS_DIRECT) { + src_da16(file, + inst->bits1.da16.src1_reg_type, + inst->bits1.da16.src1_reg_file, + inst->bits3.da16.src1_vert_stride, + inst->bits3.da16.src1_reg_nr, + inst->bits3.da16.src1_subreg_nr, + inst->bits3.da16.src1_abs, + inst->bits3.da16.src1_negate, + inst->bits3.da16.src1_swz_x, + inst->bits3.da16.src1_swz_y, + inst->bits3.da16.src1_swz_z, + inst->bits3.da16.src1_swz_w); + } else { + string(file, "Indirect align16 address mode not supported"); + } + } +} + +static const int esize[6] = { + [0] = 1, + [1] = 2, + [2] = 4, + [3] = 8, + [4] = 16, + [5] = 32, +}; + +static int qtr_ctrl(FILE *file, const struct brw_instruction *inst) +{ + int qtr_ctl = inst->header.compression_control; + int size = esize[inst->header.execution_size]; + + if (size == 8) { + switch (qtr_ctl) { + case 0: + string(file, " 1Q"); + break; + case 1: + string(file, " 2Q"); + break; + case 2: + string(file, " 3Q"); + 
break; + case 3: + string(file, " 4Q"); + break; + } + } else if (size == 16){ + if (qtr_ctl < 2) + string(file, " 1H"); + else + string(file, " 2H"); + } + return 0; +} + +void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen) +{ + int space = 0; + + format(file, "%08x %08x %08x %08x\n", + ((const uint32_t*)inst)[0], + ((const uint32_t*)inst)[1], + ((const uint32_t*)inst)[2], + ((const uint32_t*)inst)[3]); + + if (inst->header.predicate_control) { + string(file, "("); + control(file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL); + string(file, "f0"); + if (inst->bits2.da1.flag_subreg_nr) + format(file, ".%d", inst->bits2.da1.flag_subreg_nr); + if (inst->header.access_mode == BRW_ALIGN_1) + control(file, "predicate control align1", pred_ctrl_align1, + inst->header.predicate_control, NULL); + else + control(file, "predicate control align16", pred_ctrl_align16, + inst->header.predicate_control, NULL); + string(file, ") "); + } + + print_opcode(file, inst->header.opcode); + control(file, "saturate", saturate, inst->header.saturate, NULL); + control(file, "debug control", debug_ctrl, inst->header.debug_control, NULL); + + if (inst->header.opcode == BRW_OPCODE_MATH) { + string(file, " "); + control(file, "function", math_function, + inst->header.destreg__conditionalmod, NULL); + } else if (inst->header.opcode != BRW_OPCODE_SEND && + inst->header.opcode != BRW_OPCODE_SENDC) + control(file, "conditional modifier", conditional_modifier, + inst->header.destreg__conditionalmod, NULL); + + if (inst->header.opcode != BRW_OPCODE_NOP) { + string(file, "("); + control(file, "execution size", exec_size, inst->header.execution_size, NULL); + string(file, ")"); + } + + if (inst->header.opcode == BRW_OPCODE_SEND && gen < 60) + format(file, " %d", inst->header.destreg__conditionalmod); + + if (opcode[inst->header.opcode].ndst > 0) { + pad(file, 16); + dest(file, inst); + } else if (gen >= 60 && (inst->header.opcode == BRW_OPCODE_IF || + 
inst->header.opcode == BRW_OPCODE_ELSE || + inst->header.opcode == BRW_OPCODE_ENDIF || + inst->header.opcode == BRW_OPCODE_WHILE)) { + format(file, " %d", inst->bits1.branch_gen6.jump_count); + } + + if (opcode[inst->header.opcode].nsrc > 0) { + pad(file, 32); + src0(file, inst); + } + if (opcode[inst->header.opcode].nsrc > 1) { + pad(file, 48); + src1(file, inst); + } + + if (inst->header.opcode == BRW_OPCODE_SEND || + inst->header.opcode == BRW_OPCODE_SENDC) { + enum brw_message_target target; + + if (gen >= 60) + target = inst->header.destreg__conditionalmod; + else if (gen >= 50) + target = inst->bits2.send_gen5.sfid; + else + target = inst->bits3.generic.msg_target; + + newline (file); + pad (file, 16); + space = 0; + + if (gen >= 60) { + control (file, "target function", target_function_gen6, + target, &space); + } else { + control (file, "target function", target_function, + target, &space); + } + + switch (target) { + case BRW_SFID_MATH: + control (file, "math function", math_function, + inst->bits3.math.function, &space); + control (file, "math saturate", math_saturate, + inst->bits3.math.saturate, &space); + control (file, "math signed", math_signed, + inst->bits3.math.int_type, &space); + control (file, "math scalar", math_scalar, + inst->bits3.math.data_type, &space); + control (file, "math precision", math_precision, + inst->bits3.math.precision, &space); + break; + case BRW_SFID_SAMPLER: + if (gen >= 70) { + format (file, " (%d, %d, %d, %d)", + inst->bits3.sampler_gen7.binding_table_index, + inst->bits3.sampler_gen7.sampler, + inst->bits3.sampler_gen7.msg_type, + inst->bits3.sampler_gen7.simd_mode); + } else if (gen >= 50) { + format (file, " (%d, %d, %d, %d)", + inst->bits3.sampler_gen5.binding_table_index, + inst->bits3.sampler_gen5.sampler, + inst->bits3.sampler_gen5.msg_type, + inst->bits3.sampler_gen5.simd_mode); + } else if (gen >= 45) { + format (file, " (%d, %d)", + inst->bits3.sampler_g4x.binding_table_index, + 
inst->bits3.sampler_g4x.sampler); + } else { + format (file, " (%d, %d, ", + inst->bits3.sampler.binding_table_index, + inst->bits3.sampler.sampler); + control (file, "sampler target format", + sampler_target_format, + inst->bits3.sampler.return_format, NULL); + string (file, ")"); + } + break; + case BRW_SFID_DATAPORT_READ: + if (gen >= 60) { + format (file, " (%d, %d, %d, %d)", + inst->bits3.gen6_dp.binding_table_index, + inst->bits3.gen6_dp.msg_control, + inst->bits3.gen6_dp.msg_type, + inst->bits3.gen6_dp.send_commit_msg); + } else if (gen >= 45) { + format (file, " (%d, %d, %d)", + inst->bits3.dp_read_gen5.binding_table_index, + inst->bits3.dp_read_gen5.msg_control, + inst->bits3.dp_read_gen5.msg_type); + } else { + format (file, " (%d, %d, %d)", + inst->bits3.dp_read.binding_table_index, + inst->bits3.dp_read.msg_control, + inst->bits3.dp_read.msg_type); + } + break; + + case BRW_SFID_DATAPORT_WRITE: + if (gen >= 70) { + format (file, " ("); + + control (file, "DP rc message type", + dp_rc_msg_type_gen6, + inst->bits3.gen7_dp.msg_type, &space); + + format (file, ", %d, %d, %d)", + inst->bits3.gen7_dp.binding_table_index, + inst->bits3.gen7_dp.msg_control, + inst->bits3.gen7_dp.msg_type); + } else if (gen >= 60) { + format (file, " ("); + + control (file, "DP rc message type", + dp_rc_msg_type_gen6, + inst->bits3.gen6_dp.msg_type, &space); + + format (file, ", %d, %d, %d, %d)", + inst->bits3.gen6_dp.binding_table_index, + inst->bits3.gen6_dp.msg_control, + inst->bits3.gen6_dp.msg_type, + inst->bits3.gen6_dp.send_commit_msg); + } else { + format (file, " (%d, %d, %d, %d)", + inst->bits3.dp_write.binding_table_index, + (inst->bits3.dp_write.last_render_target << 3) | + inst->bits3.dp_write.msg_control, + inst->bits3.dp_write.msg_type, + inst->bits3.dp_write.send_commit_msg); + } + break; + + case BRW_SFID_URB: + if (gen >= 50) { + format (file, " %d", inst->bits3.urb_gen5.offset); + } else { + format (file, " %d", inst->bits3.urb.offset); + } + + space = 1; + if 
(gen >= 50) { + control (file, "urb opcode", urb_opcode, + inst->bits3.urb_gen5.opcode, &space); + } + control (file, "urb swizzle", urb_swizzle, + inst->bits3.urb.swizzle_control, &space); + control (file, "urb allocate", urb_allocate, + inst->bits3.urb.allocate, &space); + control (file, "urb used", urb_used, + inst->bits3.urb.used, &space); + control (file, "urb complete", urb_complete, + inst->bits3.urb.complete, &space); + break; + case BRW_SFID_THREAD_SPAWNER: + break; + case GEN7_SFID_DATAPORT_DATA_CACHE: + format (file, " (%d, %d, %d)", + inst->bits3.gen7_dp.binding_table_index, + inst->bits3.gen7_dp.msg_control, + inst->bits3.gen7_dp.msg_type); + break; + + + default: + format (file, "unsupported target %d", target); + break; + } + if (space) + string (file, " "); + if (gen >= 50) { + format (file, "mlen %d", + inst->bits3.generic_gen5.msg_length); + format (file, " rlen %d", + inst->bits3.generic_gen5.response_length); + } else { + format (file, "mlen %d", + inst->bits3.generic.msg_length); + format (file, " rlen %d", + inst->bits3.generic.response_length); + } + } + pad(file, 64); + if (inst->header.opcode != BRW_OPCODE_NOP) { + string(file, "{"); + space = 1; + control(file, "access mode", access_mode, inst->header.access_mode, &space); + if (gen >= 60) + control(file, "write enable control", wectrl, inst->header.mask_control, &space); + else + control(file, "mask control", mask_ctrl, inst->header.mask_control, &space); + control(file, "dependency control", dep_ctrl, inst->header.dependency_control, &space); + + if (gen >= 60) + qtr_ctrl(file, inst); + else { + if (inst->header.compression_control == BRW_COMPRESSION_COMPRESSED && + opcode[inst->header.opcode].ndst > 0 && + inst->bits1.da1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE && + inst->bits1.da1.dest_reg_nr & (1 << 7)) { + format(file, " compr4"); + } else { + control(file, "compression control", compr_ctrl, + inst->header.compression_control, &space); + } + } + + control(file, "thread control", 
thread_ctrl, inst->header.thread_control, &space); + if (gen >= 60) + control(file, "acc write control", accwr, inst->header.acc_wr_control, &space); + if (inst->header.opcode == BRW_OPCODE_SEND || + inst->header.opcode == BRW_OPCODE_SENDC) + control(file, "end of thread", end_of_thread, + inst->bits3.generic.end_of_thread, &space); + if (space) + string(file, " "); + string(file, "}"); + } + string(file, ";"); + newline(file); +} diff --git a/cogl/driver/drm/brw/brw_eu.c b/cogl/driver/drm/brw/brw_eu.c new file mode 100644 index 00000000..7c32ea19 --- /dev/null +++ b/cogl/driver/drm/brw/brw_eu.c @@ -0,0 +1,150 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + +#include "brw_eu.h" + +#include +#include + +/* Returns the corresponding conditional mod for swapping src0 and + * src1 in e.g. CMP. + */ +uint32_t +brw_swap_cmod(uint32_t cmod) +{ + switch (cmod) { + case BRW_CONDITIONAL_Z: + case BRW_CONDITIONAL_NZ: + return cmod; + case BRW_CONDITIONAL_G: + return BRW_CONDITIONAL_LE; + case BRW_CONDITIONAL_GE: + return BRW_CONDITIONAL_L; + case BRW_CONDITIONAL_L: + return BRW_CONDITIONAL_GE; + case BRW_CONDITIONAL_LE: + return BRW_CONDITIONAL_G; + default: + return ~0; + } +} + +/* How does predicate control work when execution_size != 8? Do I + * need to test/set for 0xffff when execution_size is 16? + */ +void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value ) +{ + p->current->header.predicate_control = BRW_PREDICATE_NONE; + + if (value != 0xff) { + if (value != p->flag_value) { + brw_MOV(p, brw_flag_reg(), brw_imm_uw(value)); + p->flag_value = value; + } + + p->current->header.predicate_control = BRW_PREDICATE_NORMAL; + } +} + +void brw_set_compression_control(struct brw_compile *p, + enum brw_compression compression_control) +{ + p->compressed = (compression_control == BRW_COMPRESSION_COMPRESSED); + + if (p->gen >= 60) { + /* Since we don't use the 32-wide support in gen6, we translate + * the pre-gen6 compression control here. + */ + switch (compression_control) { + case BRW_COMPRESSION_NONE: + /* This is the "use the first set of bits of dmask/vmask/arf + * according to execsize" option. + */ + p->current->header.compression_control = GEN6_COMPRESSION_1Q; + break; + case BRW_COMPRESSION_2NDHALF: + /* For 8-wide, this is "use the second set of 8 bits." */ + p->current->header.compression_control = GEN6_COMPRESSION_2Q; + break; + case BRW_COMPRESSION_COMPRESSED: + /* For 16-wide instruction compression, use the first set of 16 bits + * since we don't do 32-wide dispatch. 
+ */ + p->current->header.compression_control = GEN6_COMPRESSION_1H; + break; + default: + assert(!"not reached"); + p->current->header.compression_control = GEN6_COMPRESSION_1H; + break; + } + } else { + p->current->header.compression_control = compression_control; + } +} + +void brw_push_insn_state( struct brw_compile *p ) +{ + assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]); + memcpy(p->current+1, p->current, sizeof(struct brw_instruction)); + p->compressed_stack[p->current - p->stack] = p->compressed; + p->current++; +} + +void brw_pop_insn_state( struct brw_compile *p ) +{ + assert(p->current != p->stack); + p->current--; + p->compressed = p->compressed_stack[p->current - p->stack]; +} + +void brw_compile_init(struct brw_compile *p, int gen, void *store) +{ + assert(gen); + + p->gen = gen; + p->store = store; + + p->nr_insn = 0; + p->current = p->stack; + p->compressed = false; + memset(p->current, 0, sizeof(p->current[0])); + + /* Some defaults? + */ + brw_set_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */ + brw_set_saturate(p, 0); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_predicate_control_flag_value(p, 0xff); + + p->if_stack_depth = 0; + p->if_stack_array_size = 0; + p->if_stack = NULL; +} diff --git a/cogl/driver/drm/brw/brw_eu.h b/cogl/driver/drm/brw/brw_eu.h new file mode 100644 index 00000000..65e66d5e --- /dev/null +++ b/cogl/driver/drm/brw/brw_eu.h @@ -0,0 +1,2266 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ +/* + * Authors: + * Keith Whitwell + */ + + +#ifndef BRW_EU_H +#define BRW_EU_H + +#include +#include +#include +#include + +#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6)) +#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3) + +#define BRW_SWIZZLE_NOOP BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XYZW BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XXXX BRW_SWIZZLE4(0,0,0,0) +#define BRW_SWIZZLE_YYYY BRW_SWIZZLE4(1,1,1,1) +#define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2) +#define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3) +#define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1) + +#define WRITEMASK_X 0x1 +#define WRITEMASK_Y 0x2 +#define WRITEMASK_Z 0x4 +#define WRITEMASK_W 0x8 + +#define WRITEMASK_XY (WRITEMASK_X | WRITEMASK_Y) +#define WRITEMASK_XYZ (WRITEMASK_X | WRITEMASK_Y | WRITEMASK_Z) +#define WRITEMASK_XYZW (WRITEMASK_X | WRITEMASK_Y | WRITEMASK_Z | WRITEMASK_W) + +/** Number of general purpose registers (VS, WM, etc) */ +#define BRW_MAX_GRF 128 + +/** Number of message register file registers */ +#define BRW_MAX_MRF 16 + + +#define BRW_ALIGN_1 0 +#define BRW_ALIGN_16 1 + +#define BRW_ADDRESS_DIRECT 0 +#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER 1 + +#define BRW_CHANNEL_X 0 +#define BRW_CHANNEL_Y 1 +#define BRW_CHANNEL_Z 2 +#define BRW_CHANNEL_W 3 + +enum brw_compression { + BRW_COMPRESSION_NONE, + BRW_COMPRESSION_2NDHALF, + BRW_COMPRESSION_COMPRESSED, +}; + +#define GEN6_COMPRESSION_1Q 0 +#define GEN6_COMPRESSION_2Q 1 +#define GEN6_COMPRESSION_3Q 2 +#define GEN6_COMPRESSION_4Q 3 +#define GEN6_COMPRESSION_1H 0 +#define GEN6_COMPRESSION_2H 2 + +#define BRW_CONDITIONAL_NONE 0 +#define BRW_CONDITIONAL_Z 1 +#define BRW_CONDITIONAL_NZ 2 +#define BRW_CONDITIONAL_EQ 1 /* Z */ +#define BRW_CONDITIONAL_NEQ 2 /* NZ */ +#define BRW_CONDITIONAL_G 3 +#define BRW_CONDITIONAL_GE 4 +#define BRW_CONDITIONAL_L 5 +#define BRW_CONDITIONAL_LE 6 +#define BRW_CONDITIONAL_R 7 +#define 
BRW_CONDITIONAL_O 8 +#define BRW_CONDITIONAL_U 9 + +#define BRW_DEBUG_NONE 0 +#define BRW_DEBUG_BREAKPOINT 1 + +#define BRW_DEPENDENCY_NORMAL 0 +#define BRW_DEPENDENCY_NOTCLEARED 1 +#define BRW_DEPENDENCY_NOTCHECKED 2 +#define BRW_DEPENDENCY_DISABLE 3 + +#define BRW_EXECUTE_1 0 +#define BRW_EXECUTE_2 1 +#define BRW_EXECUTE_4 2 +#define BRW_EXECUTE_8 3 +#define BRW_EXECUTE_16 4 +#define BRW_EXECUTE_32 5 + +#define BRW_HORIZONTAL_STRIDE_0 0 +#define BRW_HORIZONTAL_STRIDE_1 1 +#define BRW_HORIZONTAL_STRIDE_2 2 +#define BRW_HORIZONTAL_STRIDE_4 3 + +#define BRW_INSTRUCTION_NORMAL 0 +#define BRW_INSTRUCTION_SATURATE 1 + +#define BRW_MASK_ENABLE 0 +#define BRW_MASK_DISABLE 1 + +/** @{ + * + * Gen6 has replaced "mask enable/disable" with WECtrl, which is + * effectively the same but much simpler to think about. Now, there + * are two contributors ANDed together to whether channels are + * executed: The predication on the instruction, and the channel write + * enable. + */ +/** + * This is the default value. It means that a channel's write enable is set + * if the per-channel IP is pointing at this instruction. + */ +#define BRW_WE_NORMAL 0 +/** + * This is used like BRW_MASK_DISABLE, and causes all channels to have + * their write enable set. Note that predication still contributes to + * whether the channel actually gets written. + */ +#define BRW_WE_ALL 1 +/** @} */ + +enum opcode { + /* These are the actual hardware opcodes. 
*/ + BRW_OPCODE_MOV = 1, + BRW_OPCODE_SEL = 2, + BRW_OPCODE_NOT = 4, + BRW_OPCODE_AND = 5, + BRW_OPCODE_OR = 6, + BRW_OPCODE_XOR = 7, + BRW_OPCODE_SHR = 8, + BRW_OPCODE_SHL = 9, + BRW_OPCODE_RSR = 10, + BRW_OPCODE_RSL = 11, + BRW_OPCODE_ASR = 12, + BRW_OPCODE_CMP = 16, + BRW_OPCODE_CMPN = 17, + BRW_OPCODE_JMPI = 32, + BRW_OPCODE_IF = 34, + BRW_OPCODE_IFF = 35, + BRW_OPCODE_ELSE = 36, + BRW_OPCODE_ENDIF = 37, + BRW_OPCODE_DO = 38, + BRW_OPCODE_WHILE = 39, + BRW_OPCODE_BREAK = 40, + BRW_OPCODE_CONTINUE = 41, + BRW_OPCODE_HALT = 42, + BRW_OPCODE_MSAVE = 44, + BRW_OPCODE_MRESTORE = 45, + BRW_OPCODE_PUSH = 46, + BRW_OPCODE_POP = 47, + BRW_OPCODE_WAIT = 48, + BRW_OPCODE_SEND = 49, + BRW_OPCODE_SENDC = 50, + BRW_OPCODE_MATH = 56, + BRW_OPCODE_ADD = 64, + BRW_OPCODE_MUL = 65, + BRW_OPCODE_AVG = 66, + BRW_OPCODE_FRC = 67, + BRW_OPCODE_RNDU = 68, + BRW_OPCODE_RNDD = 69, + BRW_OPCODE_RNDE = 70, + BRW_OPCODE_RNDZ = 71, + BRW_OPCODE_MAC = 72, + BRW_OPCODE_MACH = 73, + BRW_OPCODE_LZD = 74, + BRW_OPCODE_SAD2 = 80, + BRW_OPCODE_SADA2 = 81, + BRW_OPCODE_DP4 = 84, + BRW_OPCODE_DPH = 85, + BRW_OPCODE_DP3 = 86, + BRW_OPCODE_DP2 = 87, + BRW_OPCODE_DPA2 = 88, + BRW_OPCODE_LINE = 89, + BRW_OPCODE_PLN = 90, + BRW_OPCODE_NOP = 126, + + /* These are compiler backend opcodes that get translated into other + * instructions. 
+ */ + FS_OPCODE_FB_WRITE = 128, + SHADER_OPCODE_RCP, + SHADER_OPCODE_RSQ, + SHADER_OPCODE_SQRT, + SHADER_OPCODE_EXP2, + SHADER_OPCODE_LOG2, + SHADER_OPCODE_POW, + SHADER_OPCODE_SIN, + SHADER_OPCODE_COS, + FS_OPCODE_DDX, + FS_OPCODE_DDY, + FS_OPCODE_PIXEL_X, + FS_OPCODE_PIXEL_Y, + FS_OPCODE_CINTERP, + FS_OPCODE_LINTERP, + FS_OPCODE_TEX, + FS_OPCODE_TXB, + FS_OPCODE_TXD, + FS_OPCODE_TXF, + FS_OPCODE_TXL, + FS_OPCODE_TXS, + FS_OPCODE_DISCARD, + FS_OPCODE_SPILL, + FS_OPCODE_UNSPILL, + FS_OPCODE_PULL_CONSTANT_LOAD, + + VS_OPCODE_URB_WRITE, + VS_OPCODE_SCRATCH_READ, + VS_OPCODE_SCRATCH_WRITE, + VS_OPCODE_PULL_CONSTANT_LOAD, +}; + +#define BRW_PREDICATE_NONE 0 +#define BRW_PREDICATE_NORMAL 1 +#define BRW_PREDICATE_ALIGN1_ANYV 2 +#define BRW_PREDICATE_ALIGN1_ALLV 3 +#define BRW_PREDICATE_ALIGN1_ANY2H 4 +#define BRW_PREDICATE_ALIGN1_ALL2H 5 +#define BRW_PREDICATE_ALIGN1_ANY4H 6 +#define BRW_PREDICATE_ALIGN1_ALL4H 7 +#define BRW_PREDICATE_ALIGN1_ANY8H 8 +#define BRW_PREDICATE_ALIGN1_ALL8H 9 +#define BRW_PREDICATE_ALIGN1_ANY16H 10 +#define BRW_PREDICATE_ALIGN1_ALL16H 11 +#define BRW_PREDICATE_ALIGN16_REPLICATE_X 2 +#define BRW_PREDICATE_ALIGN16_REPLICATE_Y 3 +#define BRW_PREDICATE_ALIGN16_REPLICATE_Z 4 +#define BRW_PREDICATE_ALIGN16_REPLICATE_W 5 +#define BRW_PREDICATE_ALIGN16_ANY4H 6 +#define BRW_PREDICATE_ALIGN16_ALL4H 7 + +#define BRW_ARCHITECTURE_REGISTER_FILE 0 +#define BRW_GENERAL_REGISTER_FILE 1 +#define BRW_MESSAGE_REGISTER_FILE 2 +#define BRW_IMMEDIATE_VALUE 3 + +#define BRW_REGISTER_TYPE_UD 0 +#define BRW_REGISTER_TYPE_D 1 +#define BRW_REGISTER_TYPE_UW 2 +#define BRW_REGISTER_TYPE_W 3 +#define BRW_REGISTER_TYPE_UB 4 +#define BRW_REGISTER_TYPE_B 5 +#define BRW_REGISTER_TYPE_VF 5 /* packed float vector, immediates only? 
*/ +#define BRW_REGISTER_TYPE_HF 6 +#define BRW_REGISTER_TYPE_V 6 /* packed int vector, immediates only, uword dest only */ +#define BRW_REGISTER_TYPE_F 7 + +#define BRW_ARF_NULL 0x00 +#define BRW_ARF_ADDRESS 0x10 +#define BRW_ARF_ACCUMULATOR 0x20 +#define BRW_ARF_FLAG 0x30 +#define BRW_ARF_MASK 0x40 +#define BRW_ARF_MASK_STACK 0x50 +#define BRW_ARF_MASK_STACK_DEPTH 0x60 +#define BRW_ARF_STATE 0x70 +#define BRW_ARF_CONTROL 0x80 +#define BRW_ARF_NOTIFICATION_COUNT 0x90 +#define BRW_ARF_IP 0xA0 + +#define BRW_MRF_COMPR4 (1 << 7) + +#define BRW_AMASK 0 +#define BRW_IMASK 1 +#define BRW_LMASK 2 +#define BRW_CMASK 3 + +#define BRW_THREAD_NORMAL 0 +#define BRW_THREAD_ATOMIC 1 +#define BRW_THREAD_SWITCH 2 + +#define BRW_VERTICAL_STRIDE_0 0 +#define BRW_VERTICAL_STRIDE_1 1 +#define BRW_VERTICAL_STRIDE_2 2 +#define BRW_VERTICAL_STRIDE_4 3 +#define BRW_VERTICAL_STRIDE_8 4 +#define BRW_VERTICAL_STRIDE_16 5 +#define BRW_VERTICAL_STRIDE_32 6 +#define BRW_VERTICAL_STRIDE_64 7 +#define BRW_VERTICAL_STRIDE_128 8 +#define BRW_VERTICAL_STRIDE_256 9 +#define BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL 0xF + +#define BRW_WIDTH_1 0 +#define BRW_WIDTH_2 1 +#define BRW_WIDTH_4 2 +#define BRW_WIDTH_8 3 +#define BRW_WIDTH_16 4 + +#define BRW_STATELESS_BUFFER_BOUNDARY_1K 0 +#define BRW_STATELESS_BUFFER_BOUNDARY_2K 1 +#define BRW_STATELESS_BUFFER_BOUNDARY_4K 2 +#define BRW_STATELESS_BUFFER_BOUNDARY_8K 3 +#define BRW_STATELESS_BUFFER_BOUNDARY_16K 4 +#define BRW_STATELESS_BUFFER_BOUNDARY_32K 5 +#define BRW_STATELESS_BUFFER_BOUNDARY_64K 6 +#define BRW_STATELESS_BUFFER_BOUNDARY_128K 7 +#define BRW_STATELESS_BUFFER_BOUNDARY_256K 8 +#define BRW_STATELESS_BUFFER_BOUNDARY_512K 9 +#define BRW_STATELESS_BUFFER_BOUNDARY_1M 10 +#define BRW_STATELESS_BUFFER_BOUNDARY_2M 11 + +#define BRW_POLYGON_FACING_FRONT 0 +#define BRW_POLYGON_FACING_BACK 1 + +#define BRW_MESSAGE_TARGET_NULL 0 +#define BRW_MESSAGE_TARGET_MATH 1 /* reserved on GEN6 */ +#define BRW_MESSAGE_TARGET_SAMPLER 2 +#define BRW_MESSAGE_TARGET_GATEWAY 3 
+#define BRW_MESSAGE_TARGET_DATAPORT_READ 4 +#define BRW_MESSAGE_TARGET_DATAPORT_WRITE 5 +#define BRW_MESSAGE_TARGET_URB 6 +#define BRW_MESSAGE_TARGET_THREAD_SPAWNER 7 + +#define GEN6_MESSAGE_TARGET_DP_SAMPLER_CACHE 4 +#define GEN6_MESSAGE_TARGET_DP_RENDER_CACHE 5 +#define GEN6_MESSAGE_TARGET_DP_CONST_CACHE 9 + +#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0 +#define BRW_SAMPLER_RETURN_FORMAT_UINT32 2 +#define BRW_SAMPLER_RETURN_FORMAT_SINT32 3 + +#define BRW_SAMPLER_MESSAGE_SAMPLE 0 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0 +#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD8_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD16_LD 3 + +#define GEN5_SAMPLER_MESSAGE_SAMPLE 0 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS 1 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD 2 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE 3 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS 4 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_LD 7 +#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10 + +/* for GEN5 only */ +#define BRW_SAMPLER_SIMD_MODE_SIMD4X2 0 +#define BRW_SAMPLER_SIMD_MODE_SIMD8 1 +#define BRW_SAMPLER_SIMD_MODE_SIMD16 2 +#define BRW_SAMPLER_SIMD_MODE_SIMD32_64 3 + +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW 0 +#define 
BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH 1 +#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2 +#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3 +#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4 + +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD 0 +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS 2 + +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2 +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3 + +/* This one stays the same across generations. */ +#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0 +/* GEN4 */ +#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1 +#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 2 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3 +/* G45, GEN5 */ +#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ 3 +#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 +/* GEN6 */ +#define GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ 5 +#define GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 + +#define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0 +#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1 +#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE 2 + +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE 0 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED 1 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01 2 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23 3 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01 4 + +/** + * Message target: Shared Function ID for where to SEND a message. + * + * These are enumerated in the ISA reference under "send - Send Message". 
+ * In particular, see the following tables: + * - G45 PRM, Volume 4, Table 14-15 "Message Descriptor Definition" + * - Sandybridge PRM, Volume 4 Part 2, Table 8-16 "Extended Message Descriptor" + * - BSpec, Volume 1a (GPU Overview) / Graphics Processing Engine (GPE) / + * Overview / GPE Function IDs + */ +enum brw_message_target { + BRW_SFID_NULL = 0, + BRW_SFID_MATH = 1, /* Only valid on Gen4-5 */ + BRW_SFID_SAMPLER = 2, + BRW_SFID_MESSAGE_GATEWAY = 3, + BRW_SFID_DATAPORT_READ = 4, + BRW_SFID_DATAPORT_WRITE = 5, + BRW_SFID_URB = 6, + BRW_SFID_THREAD_SPAWNER = 7, + + GEN6_SFID_DATAPORT_SAMPLER_CACHE = 4, + GEN6_SFID_DATAPORT_RENDER_CACHE = 5, + GEN6_SFID_DATAPORT_CONSTANT_CACHE = 9, + + GEN7_SFID_DATAPORT_DATA_CACHE = 10, +}; + +#define GEN7_MESSAGE_TARGET_DP_DATA_CACHE 10 + +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 0 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 1 +#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 2 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 3 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 4 +#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5 +#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7 + +/* GEN6 */ +#define GEN6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE 7 +#define GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 8 +#define GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 9 +#define GEN6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE 10 +#define GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 11 +#define GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 12 +#define GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE 13 +#define GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE 14 + +#define BRW_MATH_FUNCTION_INV 1 +#define BRW_MATH_FUNCTION_LOG 2 +#define BRW_MATH_FUNCTION_EXP 3 +#define BRW_MATH_FUNCTION_SQRT 4 +#define BRW_MATH_FUNCTION_RSQ 5 +#define BRW_MATH_FUNCTION_SIN 6 /* was 7 */ +#define BRW_MATH_FUNCTION_COS 7 /* was 8 */ +#define BRW_MATH_FUNCTION_SINCOS 8 /* was 6 
*/ +#define BRW_MATH_FUNCTION_TAN 9 /* gen4 */ +#define BRW_MATH_FUNCTION_FDIV 9 /* gen6+ */ +#define BRW_MATH_FUNCTION_POW 10 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12 +#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER 13 + +#define BRW_MATH_INTEGER_UNSIGNED 0 +#define BRW_MATH_INTEGER_SIGNED 1 + +#define BRW_MATH_PRECISION_FULL 0 +#define BRW_MATH_PRECISION_PARTIAL 1 + +#define BRW_MATH_SATURATE_NONE 0 +#define BRW_MATH_SATURATE_SATURATE 1 + +#define BRW_MATH_DATA_VECTOR 0 +#define BRW_MATH_DATA_SCALAR 1 + +#define BRW_URB_OPCODE_WRITE 0 + +#define BRW_URB_SWIZZLE_NONE 0 +#define BRW_URB_SWIZZLE_INTERLEAVE 1 +#define BRW_URB_SWIZZLE_TRANSPOSE 2 + +#define BRW_SCRATCH_SPACE_SIZE_1K 0 +#define BRW_SCRATCH_SPACE_SIZE_2K 1 +#define BRW_SCRATCH_SPACE_SIZE_4K 2 +#define BRW_SCRATCH_SPACE_SIZE_8K 3 +#define BRW_SCRATCH_SPACE_SIZE_16K 4 +#define BRW_SCRATCH_SPACE_SIZE_32K 5 +#define BRW_SCRATCH_SPACE_SIZE_64K 6 +#define BRW_SCRATCH_SPACE_SIZE_128K 7 +#define BRW_SCRATCH_SPACE_SIZE_256K 8 +#define BRW_SCRATCH_SPACE_SIZE_512K 9 +#define BRW_SCRATCH_SPACE_SIZE_1M 10 +#define BRW_SCRATCH_SPACE_SIZE_2M 11 + +#define REG_SIZE (8*4) + +struct brw_instruction { + struct { + unsigned opcode:7; + unsigned pad:1; + unsigned access_mode:1; + unsigned mask_control:1; + unsigned dependency_control:2; + unsigned compression_control:2; /* gen6: quater control */ + unsigned thread_control:2; + unsigned predicate_control:4; + unsigned predicate_inverse:1; + unsigned execution_size:3; + /** + * Conditional Modifier for most instructions. On Gen6+, this is also + * used for the SEND instruction's Message Target/SFID. 
+ */ + unsigned destreg__conditionalmod:4; + unsigned acc_wr_control:1; + unsigned cmpt_control:1; + unsigned debug_control:1; + unsigned saturate:1; + } header; + + union { + struct { + unsigned dest_reg_file:2; + unsigned dest_reg_type:3; + unsigned src0_reg_file:2; + unsigned src0_reg_type:3; + unsigned src1_reg_file:2; + unsigned src1_reg_type:3; + unsigned pad:1; + unsigned dest_subreg_nr:5; + unsigned dest_reg_nr:8; + unsigned dest_horiz_stride:2; + unsigned dest_address_mode:1; + } da1; + + struct { + unsigned dest_reg_file:2; + unsigned dest_reg_type:3; + unsigned src0_reg_file:2; + unsigned src0_reg_type:3; + unsigned src1_reg_file:2; /* 0x00000c00 */ + unsigned src1_reg_type:3; /* 0x00007000 */ + unsigned pad:1; + int dest_indirect_offset:10; /* offset against the deref'd address reg */ + unsigned dest_subreg_nr:3; /* subnr for the address reg a0.x */ + unsigned dest_horiz_stride:2; + unsigned dest_address_mode:1; + } ia1; + + struct { + unsigned dest_reg_file:2; + unsigned dest_reg_type:3; + unsigned src0_reg_file:2; + unsigned src0_reg_type:3; + unsigned src1_reg_file:2; + unsigned src1_reg_type:3; + unsigned pad:1; + unsigned dest_writemask:4; + unsigned dest_subreg_nr:1; + unsigned dest_reg_nr:8; + unsigned dest_horiz_stride:2; + unsigned dest_address_mode:1; + } da16; + + struct { + unsigned dest_reg_file:2; + unsigned dest_reg_type:3; + unsigned src0_reg_file:2; + unsigned src0_reg_type:3; + unsigned pad0:6; + unsigned dest_writemask:4; + int dest_indirect_offset:6; + unsigned dest_subreg_nr:3; + unsigned dest_horiz_stride:2; + unsigned dest_address_mode:1; + } ia16; + + struct { + unsigned dest_reg_file:2; + unsigned dest_reg_type:3; + unsigned src0_reg_file:2; + unsigned src0_reg_type:3; + unsigned src1_reg_file:2; + unsigned src1_reg_type:3; + unsigned pad:1; + + int jump_count:16; + } branch_gen6; + + struct { + unsigned dest_reg_file:1; + unsigned flag_subreg_num:1; + unsigned pad0:2; + unsigned src0_abs:1; + unsigned src0_negate:1; + unsigned 
src1_abs:1; + unsigned src1_negate:1; + unsigned src2_abs:1; + unsigned src2_negate:1; + unsigned pad1:7; + unsigned dest_writemask:4; + unsigned dest_subreg_nr:3; + unsigned dest_reg_nr:8; + } da3src; + } bits1; + + + union { + struct { + unsigned src0_subreg_nr:5; + unsigned src0_reg_nr:8; + unsigned src0_abs:1; + unsigned src0_negate:1; + unsigned src0_address_mode:1; + unsigned src0_horiz_stride:2; + unsigned src0_width:3; + unsigned src0_vert_stride:4; + unsigned flag_subreg_nr:1; + unsigned flag_reg_nr:1; + unsigned pad:5; + } da1; + + struct { + int src0_indirect_offset:10; + unsigned src0_subreg_nr:3; + unsigned src0_abs:1; + unsigned src0_negate:1; + unsigned src0_address_mode:1; + unsigned src0_horiz_stride:2; + unsigned src0_width:3; + unsigned src0_vert_stride:4; + unsigned flag_subreg_nr:1; + unsigned flag_reg_nr:1; + unsigned pad:5; + } ia1; + + struct { + unsigned src0_swz_x:2; + unsigned src0_swz_y:2; + unsigned src0_subreg_nr:1; + unsigned src0_reg_nr:8; + unsigned src0_abs:1; + unsigned src0_negate:1; + unsigned src0_address_mode:1; + unsigned src0_swz_z:2; + unsigned src0_swz_w:2; + unsigned pad0:1; + unsigned src0_vert_stride:4; + unsigned flag_subreg_nr:1; + unsigned flag_reg_nr:1; + unsigned pad1:5; + } da16; + + struct { + unsigned src0_swz_x:2; + unsigned src0_swz_y:2; + int src0_indirect_offset:6; + unsigned src0_subreg_nr:3; + unsigned src0_abs:1; + unsigned src0_negate:1; + unsigned src0_address_mode:1; + unsigned src0_swz_z:2; + unsigned src0_swz_w:2; + unsigned pad0:1; + unsigned src0_vert_stride:4; + unsigned flag_subreg_nr:1; + unsigned flag_reg_nr:1; + unsigned pad1:5; + } ia16; + + /* Extended Message Descriptor for Ironlake (Gen5) SEND instruction. + * + * Does not apply to Gen6+. The SFID/message target moved to bits + * 27:24 of the header (destreg__conditionalmod); EOT is in bits3. 
+ */ + struct { + unsigned pad:26; + unsigned end_of_thread:1; + unsigned pad1:1; + unsigned sfid:4; + } send_gen5; /* for Ironlake only */ + + struct { + unsigned src0_rep_ctrl:1; + unsigned src0_swizzle:8; + unsigned src0_subreg_nr:3; + unsigned src0_reg_nr:8; + unsigned pad0:1; + unsigned src1_rep_ctrl:1; + unsigned src1_swizzle:8; + unsigned src1_subreg_nr_low:2; + } da3src; + } bits2; + + union { + struct { + unsigned src1_subreg_nr:5; + unsigned src1_reg_nr:8; + unsigned src1_abs:1; + unsigned src1_negate:1; + unsigned src1_address_mode:1; + unsigned src1_horiz_stride:2; + unsigned src1_width:3; + unsigned src1_vert_stride:4; + unsigned pad0:7; + } da1; + + struct { + unsigned src1_swz_x:2; + unsigned src1_swz_y:2; + unsigned src1_subreg_nr:1; + unsigned src1_reg_nr:8; + unsigned src1_abs:1; + unsigned src1_negate:1; + unsigned src1_address_mode:1; + unsigned src1_swz_z:2; + unsigned src1_swz_w:2; + unsigned pad1:1; + unsigned src1_vert_stride:4; + unsigned pad2:7; + } da16; + + struct { + int src1_indirect_offset:10; + unsigned src1_subreg_nr:3; + unsigned src1_abs:1; + unsigned src1_negate:1; + unsigned src1_address_mode:1; + unsigned src1_horiz_stride:2; + unsigned src1_width:3; + unsigned src1_vert_stride:4; + unsigned flag_subreg_nr:1; + unsigned flag_reg_nr:1; + unsigned pad1:5; + } ia1; + + struct { + unsigned src1_swz_x:2; + unsigned src1_swz_y:2; + int src1_indirect_offset:6; + unsigned src1_subreg_nr:3; + unsigned src1_abs:1; + unsigned src1_negate:1; + unsigned pad0:1; + unsigned src1_swz_z:2; + unsigned src1_swz_w:2; + unsigned pad1:1; + unsigned src1_vert_stride:4; + unsigned flag_subreg_nr:1; + unsigned flag_reg_nr:1; + unsigned pad2:5; + } ia16; + + struct { + int jump_count:16; /* note: signed */ + unsigned pop_count:4; + unsigned pad0:12; + } if_else; + + /* This is also used for gen7 IF/ELSE instructions */ + struct { + /* Signed jump distance to the ip to jump to if all channels + * are disabled after the break or continue. 
It should point + * to the end of the innermost control flow block, as that's + * where some channel could get re-enabled. + */ + int jip:16; + + /* Signed jump distance to the location to resume execution + * of this channel if it's enabled for the break or continue. + */ + int uip:16; + } break_cont; + + /** + * \defgroup SEND instructions / Message Descriptors + * + * @{ + */ + + /** + * Generic Message Descriptor for Gen4 SEND instructions. The structs + * below expand function_control to something specific for their + * message. Due to struct packing issues, they duplicate these bits. + * + * See the G45 PRM, Volume 4, Table 14-15. + */ + struct { + unsigned function_control:16; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } generic; + + /** + * Generic Message Descriptor for Gen5-7 SEND instructions. + * + * See the Sandybridge PRM, Volume 2 Part 2, Table 8-15. (Sadly, most + * of the information on the SEND instruction is missing from the public + * Ironlake PRM.) + * + * The table claims that bit 31 is reserved/MBZ on Gen6+, but it lies. + * According to the SEND instruction description: + * "The MSb of the message description, the EOT field, always comes from + * bit 127 of the instruction word"...which is bit 31 of this field. 
+ */ + struct { + unsigned function_control:19; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } generic_gen5; + + /** G45 PRM, Volume 4, Section 6.1.1.1 */ + struct { + unsigned function:4; + unsigned int_type:1; + unsigned precision:1; + unsigned saturate:1; + unsigned data_type:1; + unsigned pad0:8; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } math; + + /** Ironlake PRM, Volume 4 Part 1, Section 6.1.1.1 */ + struct { + unsigned function:4; + unsigned int_type:1; + unsigned precision:1; + unsigned saturate:1; + unsigned data_type:1; + unsigned snapshot:1; + unsigned pad0:10; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } math_gen5; + + /** G45 PRM, Volume 4, Section 4.8.1.1.1 [DevBW] and [DevCL] */ + struct { + unsigned binding_table_index:8; + unsigned sampler:4; + unsigned return_format:2; + unsigned msg_type:2; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } sampler; + + /** G45 PRM, Volume 4, Section 4.8.1.1.2 [DevCTG] */ + struct { + unsigned binding_table_index:8; + unsigned sampler:4; + unsigned msg_type:4; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } sampler_g4x; + + /** Ironlake PRM, Volume 4 Part 1, Section 4.11.1.1.3 */ + struct { + unsigned binding_table_index:8; + unsigned sampler:4; + unsigned msg_type:4; + unsigned simd_mode:2; + unsigned pad0:1; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } sampler_gen5; + + struct { + unsigned binding_table_index:8; + unsigned sampler:4; + unsigned msg_type:5; + unsigned simd_mode:2; + unsigned 
header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } sampler_gen7; + + struct brw_urb_immediate { + unsigned opcode:4; + unsigned offset:6; + unsigned swizzle_control:2; + unsigned pad:1; + unsigned allocate:1; + unsigned used:1; + unsigned complete:1; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } urb; + + struct { + unsigned opcode:4; + unsigned offset:6; + unsigned swizzle_control:2; + unsigned pad:1; + unsigned allocate:1; + unsigned used:1; + unsigned complete:1; + unsigned pad0:3; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } urb_gen5; + + struct { + unsigned opcode:3; + unsigned offset:11; + unsigned swizzle_control:1; + unsigned complete:1; + unsigned per_slot_offset:1; + unsigned pad0:2; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } urb_gen7; + + /** 965 PRM, Volume 4, Section 5.10.1.1: Message Descriptor */ + struct { + unsigned binding_table_index:8; + unsigned msg_control:4; + unsigned msg_type:2; + unsigned target_cache:2; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } dp_read; + + /** G45 PRM, Volume 4, Section 5.10.1.1.2 */ + struct { + unsigned binding_table_index:8; + unsigned msg_control:3; + unsigned msg_type:3; + unsigned target_cache:2; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } dp_read_g4x; + + /** Ironlake PRM, Volume 4 Part 1, Section 5.10.2.1.2. 
*/ + struct { + unsigned binding_table_index:8; + unsigned msg_control:3; + unsigned msg_type:3; + unsigned target_cache:2; + unsigned pad0:3; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } dp_read_gen5; + + /** G45 PRM, Volume 4, Section 5.10.1.1.2. For both Gen4 and G45. */ + struct { + unsigned binding_table_index:8; + unsigned msg_control:3; + unsigned last_render_target:1; + unsigned msg_type:3; + unsigned send_commit_msg:1; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } dp_write; + + /** Ironlake PRM, Volume 4 Part 1, Section 5.10.2.1.2. */ + struct { + unsigned binding_table_index:8; + unsigned msg_control:3; + unsigned last_render_target:1; + unsigned msg_type:3; + unsigned send_commit_msg:1; + unsigned pad0:3; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } dp_write_gen5; + + /** + * Message for the Sandybridge Sampler Cache or Constant Cache Data Port. + * + * See the Sandybridge PRM, Volume 4 Part 1, Section 3.9.2.1.1. + **/ + struct { + unsigned binding_table_index:8; + unsigned msg_control:5; + unsigned msg_type:3; + unsigned pad0:3; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } gen6_dp_sampler_const_cache; + + /** + * Message for the Sandybridge Render Cache Data Port. + * + * Most fields are defined in the Sandybridge PRM, Volume 4 Part 1, + * Section 3.9.2.1.1: Message Descriptor. + * + * "Slot Group Select" and "Last Render Target" are part of the + * 5-bit message control for Render Target Write messages. See + * Section 3.9.9.2.1 of the same volume. 
+ */ + struct { + unsigned binding_table_index:8; + unsigned msg_control:3; + unsigned slot_group_select:1; + unsigned last_render_target:1; + unsigned msg_type:4; + unsigned send_commit_msg:1; + unsigned pad0:1; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad1:2; + unsigned end_of_thread:1; + } gen6_dp; + + /** + * Message for any of the Gen7 Data Port caches. + * + * Most fields are defined in BSpec volume 5c.2 Data Port / Messages / + * Data Port Messages / Message Descriptor. Once again, "Slot Group + * Select" and "Last Render Target" are part of the 6-bit message + * control for Render Target Writes. + */ + struct { + unsigned binding_table_index:8; + unsigned msg_control:3; + unsigned slot_group_select:1; + unsigned last_render_target:1; + unsigned msg_control_pad:1; + unsigned msg_type:4; + unsigned pad1:1; + unsigned header_present:1; + unsigned response_length:5; + unsigned msg_length:4; + unsigned pad2:2; + unsigned end_of_thread:1; + } gen7_dp; + /** @} */ + + struct { + unsigned src1_subreg_nr_high:1; + unsigned src1_reg_nr:8; + unsigned pad0:1; + unsigned src2_rep_ctrl:1; + unsigned src2_swizzle:8; + unsigned src2_subreg_nr:3; + unsigned src2_reg_nr:8; + unsigned pad1:2; + } da3src; + + int d; + unsigned ud; + float f; + } bits3; +}; + + +/* These aren't hardware structs, just something useful for us to pass around: + * + * Align1 operation has a lot of control over input ranges. Used in + * WM programs to implement shaders decomposed into "channel serial" + * or "structure of array" form: + */ +struct brw_reg { + unsigned type:4; + unsigned file:2; + unsigned nr:8; + unsigned subnr:5; /* :1 in align16 */ + unsigned negate:1; /* source only */ + unsigned abs:1; /* source only */ + unsigned vstride:4; /* source only */ + unsigned width:3; /* src only, align1 only */ + unsigned hstride:2; /* align1 only */ + unsigned address_mode:1; /* relative addressing, hopefully! 
*/ + unsigned pad0:1; + + union { + struct { + unsigned swizzle:8; /* src only, align16 only */ + unsigned writemask:4; /* dest only, align16 only */ + int indirect_offset:10; /* relative addressing offset */ + unsigned pad1:10; /* two dwords total */ + } bits; + + float f; + int d; + unsigned ud; + } dw1; +}; + +struct brw_indirect { + unsigned addr_subnr:4; + int addr_offset:10; + unsigned pad:18; +}; + +#define BRW_EU_MAX_INSN_STACK 5 +#define BRW_EU_MAX_INSN 10000 + +struct brw_compile { + struct brw_instruction *store; + unsigned nr_insn; + + int gen; + + /* Allow clients to push/pop instruction state: + */ + struct brw_instruction stack[BRW_EU_MAX_INSN_STACK]; + bool compressed_stack[BRW_EU_MAX_INSN_STACK]; + struct brw_instruction *current; + + unsigned flag_value; + bool single_program_flow; + bool compressed; + + /* Control flow stacks: + * - if_stack contains IF and ELSE instructions which must be patched + * (and popped) once the matching ENDIF instruction is encountered. + */ + struct brw_instruction **if_stack; + int if_stack_depth; + int if_stack_array_size; +}; + +static inline int type_sz(unsigned type) +{ + switch (type) { + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_F: + return 4; + case BRW_REGISTER_TYPE_HF: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_W: + return 2; + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_B: + return 1; + default: + return 0; + } +} + +/** + * Construct a brw_reg. 
+ * \param file one of the BRW_x_REGISTER_FILE values + * \param nr register number/index + * \param subnr register sub number + * \param type one of BRW_REGISTER_TYPE_x + * \param vstride one of BRW_VERTICAL_STRIDE_x + * \param width one of BRW_WIDTH_x + * \param hstride one of BRW_HORIZONTAL_STRIDE_x + * \param swizzle one of BRW_SWIZZLE_x + * \param writemask WRITEMASK_X/Y/Z/W bitfield + */ +static inline struct brw_reg brw_reg(unsigned file, + unsigned nr, + unsigned subnr, + unsigned type, + unsigned vstride, + unsigned width, + unsigned hstride, + unsigned swizzle, + unsigned writemask) +{ + struct brw_reg reg; + if (file == BRW_GENERAL_REGISTER_FILE) + assert(nr < BRW_MAX_GRF); + else if (file == BRW_MESSAGE_REGISTER_FILE) + assert((nr & ~(1 << 7)) < BRW_MAX_MRF); + else if (file == BRW_ARCHITECTURE_REGISTER_FILE) + assert(nr <= BRW_ARF_IP); + + reg.type = type; + reg.file = file; + reg.nr = nr; + reg.subnr = subnr * type_sz(type); + reg.negate = 0; + reg.abs = 0; + reg.vstride = vstride; + reg.width = width; + reg.hstride = hstride; + reg.address_mode = BRW_ADDRESS_DIRECT; + reg.pad0 = 0; + + /* Could do better: If the reg is r5.3<0;1,0>, we probably want to + * set swizzle and writemask to W, as the lower bits of subnr will + * be lost when converted to align16. This is probably too much to + * keep track of as you'd want it adjusted by suboffset(), etc. + * Perhaps fix up when converting to align16? 
+ */ + reg.dw1.bits.swizzle = swizzle; + reg.dw1.bits.writemask = writemask; + reg.dw1.bits.indirect_offset = 0; + reg.dw1.bits.pad1 = 0; + return reg; +} + +/** Construct float[16] register */ +static inline struct brw_reg brw_vec16_reg(unsigned file, + unsigned nr, + unsigned subnr) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_16, + BRW_WIDTH_16, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +/** Construct float[8] register */ +static inline struct brw_reg brw_vec8_reg(unsigned file, + unsigned nr, + unsigned subnr) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_8, + BRW_WIDTH_8, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +/** Construct float[4] register */ +static inline struct brw_reg brw_vec4_reg(unsigned file, + unsigned nr, + unsigned subnr) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_4, + BRW_WIDTH_4, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +/** Construct float[2] register */ +static inline struct brw_reg brw_vec2_reg(unsigned file, + unsigned nr, + unsigned subnr) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_2, + BRW_WIDTH_2, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYXY, + WRITEMASK_XY); +} + +/** Construct float[1] register */ +static inline struct brw_reg brw_vec1_reg(unsigned file, + unsigned nr, + unsigned subnr) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XXXX, + WRITEMASK_X); +} + + +static inline struct brw_reg __retype(struct brw_reg reg, + unsigned type) +{ + reg.type = type; + return reg; +} + +static inline struct brw_reg __retype_d(struct brw_reg reg) +{ + return __retype(reg, BRW_REGISTER_TYPE_D); +} + +static inline struct brw_reg __retype_ud(struct brw_reg reg) +{ + return __retype(reg, 
BRW_REGISTER_TYPE_UD); +} + +static inline struct brw_reg __retype_uw(struct brw_reg reg) +{ + return __retype(reg, BRW_REGISTER_TYPE_UW); +} + +static inline struct brw_reg __sechalf(struct brw_reg reg) +{ + if (reg.vstride) + reg.nr++; + return reg; +} + +static inline struct brw_reg __suboffset(struct brw_reg reg, + unsigned delta) +{ + reg.subnr += delta * type_sz(reg.type); + return reg; +} + +static inline struct brw_reg __offset(struct brw_reg reg, + unsigned delta) +{ + reg.nr += delta; + return reg; +} + +static inline struct brw_reg byte_offset(struct brw_reg reg, + unsigned bytes) +{ + unsigned newoffset = reg.nr * REG_SIZE + reg.subnr + bytes; + reg.nr = newoffset / REG_SIZE; + reg.subnr = newoffset % REG_SIZE; + return reg; +} + + +/** Construct unsigned word[16] register */ +static inline struct brw_reg brw_uw16_reg(unsigned file, + unsigned nr, + unsigned subnr) +{ + return __suboffset(__retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +/** Construct unsigned word[8] register */ +static inline struct brw_reg brw_uw8_reg(unsigned file, + unsigned nr, + unsigned subnr) +{ + return __suboffset(__retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +/** Construct unsigned word[1] register */ +static inline struct brw_reg brw_uw1_reg(unsigned file, + unsigned nr, + unsigned subnr) +{ + return __suboffset(__retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +static inline struct brw_reg brw_imm_reg(unsigned type) +{ + return brw_reg( BRW_IMMEDIATE_VALUE, + 0, + 0, + type, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + 0, + 0); +} + +/** Construct float immediate register */ +static inline struct brw_reg brw_imm_f(float f) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F); + imm.dw1.f = f; + return imm; +} + +/** Construct integer immediate register */ +static inline struct brw_reg brw_imm_d(int d) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D); + imm.dw1.d 
= d; + return imm; +} + +/** Construct uint immediate register */ +static inline struct brw_reg brw_imm_ud(unsigned ud) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD); + imm.dw1.ud = ud; + return imm; +} + +/** Construct ushort immediate register */ +static inline struct brw_reg brw_imm_uw(uint16_t uw) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW); + imm.dw1.ud = uw | (uw << 16); + return imm; +} + +/** Construct short immediate register */ +static inline struct brw_reg brw_imm_w(int16_t w) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W); + imm.dw1.d = w | (w << 16); + return imm; +} + +/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type + * numbers alias with _V and _VF below: + */ + +/** Construct vector of eight signed half-byte values */ +static inline struct brw_reg brw_imm_v(unsigned v) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_8; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.dw1.ud = v; + return imm; +} + +/** Construct vector of four 8-bit float values */ +static inline struct brw_reg brw_imm_vf(unsigned v) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_4; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.dw1.ud = v; + return imm; +} + +#define VF_ZERO 0x0 +#define VF_ONE 0x30 +#define VF_NEG (1<<7) + +static inline struct brw_reg brw_imm_vf4(unsigned v0, + unsigned v1, + unsigned v2, + unsigned v3) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_4; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.dw1.ud = ((v0 << 0) | + (v1 << 8) | + (v2 << 16) | + (v3 << 24)); + return imm; +} + +static inline struct brw_reg brw_address(struct brw_reg reg) +{ + return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr); +} + +/** Construct float[1] general-purpose register */ +static inline struct 
brw_reg brw_vec1_grf(unsigned nr, unsigned subnr) +{ + return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[2] general-purpose register */ +static inline struct brw_reg brw_vec2_grf(unsigned nr, unsigned subnr) +{ + return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[4] general-purpose register */ +static inline struct brw_reg brw_vec4_grf(unsigned nr, unsigned subnr) +{ + return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct float[8] general-purpose register */ +static inline struct brw_reg brw_vec8_grf(unsigned nr, unsigned subnr) +{ + return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static inline struct brw_reg brw_uw8_grf(unsigned nr, unsigned subnr) +{ + return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static inline struct brw_reg brw_uw16_grf(unsigned nr, unsigned subnr) +{ + return brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +/** Construct null register (usually used for setting condition codes) */ +static inline struct brw_reg brw_null_reg(void) +{ + return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NULL, + 0); +} + +static inline struct brw_reg brw_address_reg(unsigned subnr) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_ADDRESS, + subnr); +} + +/* If/else instructions break in align16 mode if writemask & swizzle + * aren't xyzw. This goes against the convention for other scalar + * regs: + */ +static inline struct brw_reg brw_ip_reg(void) +{ + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_IP, + 0, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_4, /* ? */ + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XYZW, /* NOTE! */ + WRITEMASK_XYZW); /* NOTE! 
*/ +} + +static inline struct brw_reg brw_acc_reg(void) +{ + return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_ACCUMULATOR, + 0); +} + +static inline struct brw_reg brw_notification_1_reg(void) +{ + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NOTIFICATION_COUNT, + 1, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XXXX, + WRITEMASK_X); +} + +static inline struct brw_reg brw_flag_reg(void) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_FLAG, + 0); +} + +static inline struct brw_reg brw_mask_reg(unsigned subnr) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_MASK, + subnr); +} + +static inline struct brw_reg brw_message_reg(unsigned nr) +{ + assert((nr & ~(1 << 7)) < BRW_MAX_MRF); + return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, nr, 0); +} + +static inline struct brw_reg brw_message4_reg(unsigned nr, + int subnr) +{ + assert((nr & ~(1 << 7)) < BRW_MAX_MRF); + return brw_vec4_reg(BRW_MESSAGE_REGISTER_FILE, nr, subnr); +} + +/* This is almost always called with a numeric constant argument, so + * make things easy to evaluate at compile time: + */ +static inline unsigned cvt(unsigned val) +{ + switch (val) { + case 0: return 0; + case 1: return 1; + case 2: return 2; + case 4: return 3; + case 8: return 4; + case 16: return 5; + case 32: return 6; + } + return 0; +} + +static inline struct brw_reg __stride(struct brw_reg reg, + unsigned vstride, + unsigned width, + unsigned hstride) +{ + reg.vstride = cvt(vstride); + reg.width = cvt(width) - 1; + reg.hstride = cvt(hstride); + return reg; +} + +static inline struct brw_reg vec16(struct brw_reg reg) +{ + return __stride(reg, 16,16,1); +} + +static inline struct brw_reg vec8(struct brw_reg reg) +{ + return __stride(reg, 8,8,1); +} + +static inline struct brw_reg vec4(struct brw_reg reg) +{ + return __stride(reg, 4,4,1); +} + +static inline struct brw_reg vec2(struct brw_reg reg) +{ + return __stride(reg, 
2,2,1); +} + +static inline struct brw_reg vec1(struct brw_reg reg) +{ + return __stride(reg, 0,1,0); +} + +static inline struct brw_reg get_element(struct brw_reg reg, unsigned elt) +{ + return vec1(__suboffset(reg, elt)); +} + +static inline struct brw_reg get_element_ud(struct brw_reg reg, unsigned elt) +{ + return vec1(__suboffset(__retype(reg, BRW_REGISTER_TYPE_UD), elt)); +} + +static inline struct brw_reg brw_swizzle(struct brw_reg reg, + unsigned x, + unsigned y, + unsigned z, + unsigned w) +{ + assert(reg.file != BRW_IMMEDIATE_VALUE); + + reg.dw1.bits.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(reg.dw1.bits.swizzle, x), + BRW_GET_SWZ(reg.dw1.bits.swizzle, y), + BRW_GET_SWZ(reg.dw1.bits.swizzle, z), + BRW_GET_SWZ(reg.dw1.bits.swizzle, w)); + return reg; +} + +static inline struct brw_reg brw_swizzle1(struct brw_reg reg, + unsigned x) +{ + return brw_swizzle(reg, x, x, x, x); +} + +static inline struct brw_reg brw_writemask(struct brw_reg reg, + unsigned mask) +{ + assert(reg.file != BRW_IMMEDIATE_VALUE); + reg.dw1.bits.writemask &= mask; + return reg; +} + +static inline struct brw_reg brw_set_writemask(struct brw_reg reg, + unsigned mask) +{ + assert(reg.file != BRW_IMMEDIATE_VALUE); + reg.dw1.bits.writemask = mask; + return reg; +} + +static inline struct brw_reg brw_negate(struct brw_reg reg) +{ + reg.negate ^= 1; + return reg; +} + +static inline struct brw_reg brw_abs(struct brw_reg reg) +{ + reg.abs = 1; + return reg; +} + +/*********************************************************************** +*/ +static inline struct brw_reg brw_vec4_indirect(unsigned subnr, + int offset) +{ + struct brw_reg reg = brw_vec4_grf(0, 0); + reg.subnr = subnr; + reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.dw1.bits.indirect_offset = offset; + return reg; +} + +static inline struct brw_reg brw_vec1_indirect(unsigned subnr, + int offset) +{ + struct brw_reg reg = brw_vec1_grf(0, 0); + reg.subnr = subnr; + reg.address_mode = 
BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.dw1.bits.indirect_offset = offset; + return reg; +} + +static inline struct brw_reg deref_4f(struct brw_indirect ptr, int offset) +{ + return brw_vec4_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static inline struct brw_reg deref_1f(struct brw_indirect ptr, int offset) +{ + return brw_vec1_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static inline struct brw_reg deref_4b(struct brw_indirect ptr, int offset) +{ + return __retype(deref_4f(ptr, offset), BRW_REGISTER_TYPE_B); +} + +static inline struct brw_reg deref_1uw(struct brw_indirect ptr, int offset) +{ + return __retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW); +} + +static inline struct brw_reg deref_1d(struct brw_indirect ptr, int offset) +{ + return __retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_D); +} + +static inline struct brw_reg deref_1ud(struct brw_indirect ptr, int offset) +{ + return __retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD); +} + +static inline struct brw_reg get_addr_reg(struct brw_indirect ptr) +{ + return brw_address_reg(ptr.addr_subnr); +} + +static inline struct brw_indirect brw_indirect_offset(struct brw_indirect ptr, int offset) +{ + ptr.addr_offset += offset; + return ptr; +} + +static inline struct brw_indirect brw_indirect(unsigned addr_subnr, int offset) +{ + struct brw_indirect ptr; + ptr.addr_subnr = addr_subnr; + ptr.addr_offset = offset; + ptr.pad = 0; + return ptr; +} + +/** Do two brw_regs refer to the same register? 
*/ +static inline bool brw_same_reg(struct brw_reg r1, struct brw_reg r2) +{ + return r1.file == r2.file && r1.nr == r2.nr; +} + +static inline struct brw_instruction *current_insn( struct brw_compile *p) +{ + return &p->store[p->nr_insn]; +} + +static inline void brw_set_predicate_control( struct brw_compile *p, unsigned pc ) +{ + p->current->header.predicate_control = pc; +} + +static inline void brw_set_predicate_inverse(struct brw_compile *p, bool predicate_inverse) +{ + p->current->header.predicate_inverse = predicate_inverse; +} + +static inline void brw_set_conditionalmod( struct brw_compile *p, unsigned conditional ) +{ + p->current->header.destreg__conditionalmod = conditional; +} + +static inline void brw_set_access_mode(struct brw_compile *p, unsigned access_mode) +{ + p->current->header.access_mode = access_mode; +} + +static inline void brw_set_mask_control(struct brw_compile *p, unsigned value) +{ + p->current->header.mask_control = value; +} + +static inline void brw_set_saturate(struct brw_compile *p, unsigned value) +{ + p->current->header.saturate = value; +} + +static inline void brw_set_acc_write_control(struct brw_compile *p, unsigned value) +{ + if (p->gen >= 60) + p->current->header.acc_wr_control = value; +} + +void brw_pop_insn_state(struct brw_compile *p); +void brw_push_insn_state(struct brw_compile *p); +void brw_set_compression_control(struct brw_compile *p, enum brw_compression control); +void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value ); + +void brw_compile_init(struct brw_compile *p, int gen, void *store); + +void brw_set_dest(struct brw_compile *p, struct brw_instruction *insn, + struct brw_reg dest); +void brw_set_src0(struct brw_compile *p, struct brw_instruction *insn, + struct brw_reg reg); +void brw_set_src1(struct brw_compile *p, + struct brw_instruction *insn, + struct brw_reg reg); + +void gen6_resolve_implied_move(struct brw_compile *p, + struct brw_reg *src, + unsigned msg_reg_nr); + 
+static inline struct brw_instruction * +brw_next_insn(struct brw_compile *p, unsigned opcode) +{ + struct brw_instruction *insn; + + assert(p->nr_insn + 1 < BRW_EU_MAX_INSN); + + insn = &p->store[p->nr_insn++]; + *insn = *p->current; + + if (p->current->header.destreg__conditionalmod) { + p->current->header.destreg__conditionalmod = 0; + p->current->header.predicate_control = BRW_PREDICATE_NORMAL; + } + + insn->header.opcode = opcode; + return insn; +} + +/* Helpers for regular instructions: */ +#define ALU1(OP) \ +static inline struct brw_instruction *brw_##OP(struct brw_compile *p, \ + struct brw_reg dest, \ + struct brw_reg src0) \ +{ \ + return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \ +} + +#define ALU2(OP) \ +static inline struct brw_instruction *brw_##OP(struct brw_compile *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1) \ +{ \ + return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \ +} + +/* Rounding operations (other than RNDD) require two instructions - the first + * stores a rounded value (possibly the wrong way) in the dest register, but + * also sets a per-channel "increment bit" in the flag register. A predicated + * add of 1.0 fixes dest to contain the desired result. + * + * Sandybridge and later appear to round correctly without an ADD. 
+ */ +#define ROUND(OP) \ +static inline void brw_##OP(struct brw_compile *p, \ + struct brw_reg dest, \ + struct brw_reg src) \ +{ \ + struct brw_instruction *rnd, *add; \ + rnd = brw_next_insn(p, BRW_OPCODE_##OP); \ + brw_set_dest(p, rnd, dest); \ + brw_set_src0(p, rnd, src); \ + if (p->gen < 60) { \ + /* turn on round-increments */ \ + rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \ + add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \ + add->header.predicate_control = BRW_PREDICATE_NORMAL; \ + } \ +} + +static inline struct brw_instruction *brw_alu1(struct brw_compile *p, + unsigned opcode, + struct brw_reg dest, + struct brw_reg src) +{ + struct brw_instruction *insn = brw_next_insn(p, opcode); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src); + return insn; +} + +static inline struct brw_instruction *brw_alu2(struct brw_compile *p, + unsigned opcode, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1 ) +{ + struct brw_instruction *insn = brw_next_insn(p, opcode); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, src1); + return insn; +} + +static inline struct brw_instruction *brw_ADD(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1) +{ + /* 6.2.2: add */ + if (src0.type == BRW_REGISTER_TYPE_F || + (src0.file == BRW_IMMEDIATE_VALUE && + src0.type == BRW_REGISTER_TYPE_VF)) { + assert(src1.type != BRW_REGISTER_TYPE_UD); + assert(src1.type != BRW_REGISTER_TYPE_D); + } + + if (src1.type == BRW_REGISTER_TYPE_F || + (src1.file == BRW_IMMEDIATE_VALUE && + src1.type == BRW_REGISTER_TYPE_VF)) { + assert(src0.type != BRW_REGISTER_TYPE_UD); + assert(src0.type != BRW_REGISTER_TYPE_D); + } + + return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1); +} + +static inline struct brw_instruction *brw_MUL(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1) +{ + /* 6.32.38: mul */ + if (src0.type == BRW_REGISTER_TYPE_D || + 
src0.type == BRW_REGISTER_TYPE_UD || + src1.type == BRW_REGISTER_TYPE_D || + src1.type == BRW_REGISTER_TYPE_UD) { + assert(dest.type != BRW_REGISTER_TYPE_F); + } + + if (src0.type == BRW_REGISTER_TYPE_F || + (src0.file == BRW_IMMEDIATE_VALUE && + src0.type == BRW_REGISTER_TYPE_VF)) { + assert(src1.type != BRW_REGISTER_TYPE_UD); + assert(src1.type != BRW_REGISTER_TYPE_D); + } + + if (src1.type == BRW_REGISTER_TYPE_F || + (src1.file == BRW_IMMEDIATE_VALUE && + src1.type == BRW_REGISTER_TYPE_VF)) { + assert(src0.type != BRW_REGISTER_TYPE_UD); + assert(src0.type != BRW_REGISTER_TYPE_D); + } + + assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE || + src0.nr != BRW_ARF_ACCUMULATOR); + assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE || + src1.nr != BRW_ARF_ACCUMULATOR); + + return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1); +} + +static inline struct brw_instruction *brw_JMPI(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1) +{ + struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1); + + insn->header.execution_size = 1; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.mask_control = BRW_MASK_DISABLE; + + p->current->header.predicate_control = BRW_PREDICATE_NONE; + + return insn; +} + + +ALU1(MOV); +ALU2(SEL); +ALU1(NOT); +ALU2(AND); +ALU2(OR); +ALU2(XOR); +ALU2(SHR); +ALU2(SHL); +ALU2(RSR); +ALU2(RSL); +ALU2(ASR); +ALU1(FRC); +ALU1(RNDD); +ALU2(MAC); +ALU2(MACH); +ALU1(LZD); +ALU2(DP4); +ALU2(DPH); +ALU2(DP3); +ALU2(DP2); +ALU2(LINE); +ALU2(PLN); + +ROUND(RNDZ); +ROUND(RNDE); + +#undef ALU1 +#undef ALU2 +#undef ROUND + +/* Helpers for SEND instruction */ +void brw_set_dp_read_message(struct brw_compile *p, + struct brw_instruction *insn, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned target_cache, + unsigned msg_length, + unsigned response_length); + +void brw_set_dp_write_message(struct brw_compile *p, + struct brw_instruction *insn, 
+ unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned msg_length, + bool header_present, + bool last_render_target, + unsigned response_length, + bool end_of_thread, + bool send_commit_msg); + +void brw_urb_WRITE(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + bool used, + unsigned msg_length, + unsigned response_length, + bool eot, + bool writes_complete, + unsigned offset, + unsigned swizzle); + +void brw_ff_sync(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + unsigned response_length, + bool eot); + +void brw_fb_WRITE(struct brw_compile *p, + int dispatch_width, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned msg_control, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool eot, + bool header_present); + +void brw_SAMPLE(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned sampler, + unsigned writemask, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + bool header_present, + unsigned simd_mode); + +void brw_math_16(struct brw_compile *p, + struct brw_reg dest, + unsigned function, + unsigned saturate, + unsigned msg_reg_nr, + struct brw_reg src, + unsigned precision); + +void brw_math(struct brw_compile *p, + struct brw_reg dest, + unsigned function, + unsigned saturate, + unsigned msg_reg_nr, + struct brw_reg src, + unsigned data_type, + unsigned precision); + +void brw_math2(struct brw_compile *p, + struct brw_reg dest, + unsigned function, + struct brw_reg src0, + struct brw_reg src1); + +void brw_oword_block_read(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg mrf, + uint32_t offset, + uint32_t bind_table_index); + +void brw_oword_block_read_scratch(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + 
unsigned offset); + +void brw_oword_block_write_scratch(struct brw_compile *p, + struct brw_reg mrf, + int num_regs, + unsigned offset); + +void brw_dword_scattered_read(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg mrf, + uint32_t bind_table_index); + +void brw_dp_READ_4_vs(struct brw_compile *p, + struct brw_reg dest, + unsigned location, + unsigned bind_table_index); + +void brw_dp_READ_4_vs_relative(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg addrReg, + unsigned offset, + unsigned bind_table_index); + +/* If/else/endif. Works by manipulating the execution flags on each + * channel. + */ +struct brw_instruction *brw_IF(struct brw_compile *p, + unsigned execute_size); +struct brw_instruction *gen6_IF(struct brw_compile *p, uint32_t conditional, + struct brw_reg src0, struct brw_reg src1); + +void brw_ELSE(struct brw_compile *p); +void brw_ENDIF(struct brw_compile *p); + +/* DO/WHILE loops: +*/ +struct brw_instruction *brw_DO(struct brw_compile *p, + unsigned execute_size); + +struct brw_instruction *brw_WHILE(struct brw_compile *p, + struct brw_instruction *patch_insn); + +struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count); +struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count); +struct brw_instruction *gen6_CONT(struct brw_compile *p, + struct brw_instruction *do_insn); +/* Forward jumps: +*/ +void brw_land_fwd_jump(struct brw_compile *p, + struct brw_instruction *jmp_insn); + +void brw_NOP(struct brw_compile *p); + +void brw_WAIT(struct brw_compile *p); + +/* Special case: there is never a destination, execution size will be + * taken from src0: + */ +void brw_CMP(struct brw_compile *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1); + +void brw_print_reg(struct brw_reg reg); + +static inline void brw_math_invert(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src) +{ + brw_math(p, + dst, + BRW_MATH_FUNCTION_INV, + 
BRW_MATH_SATURATE_NONE, + 0, + src, + BRW_MATH_PRECISION_FULL, + BRW_MATH_DATA_VECTOR); +} + +void brw_set_uip_jip(struct brw_compile *p); + +uint32_t brw_swap_cmod(uint32_t cmod); + +void brw_disasm(FILE *file, + const struct brw_instruction *inst, + int gen); + +#endif diff --git a/cogl/driver/drm/brw/brw_eu_debug.c b/cogl/driver/drm/brw/brw_eu_debug.c new file mode 100644 index 00000000..99453afd --- /dev/null +++ b/cogl/driver/drm/brw/brw_eu_debug.c @@ -0,0 +1,95 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + + +#include "main/mtypes.h" +#include "main/imports.h" +#include "brw_eu.h" + +void brw_print_reg( struct brw_reg hwreg ) +{ + static const char *file[] = { + "arf", + "grf", + "msg", + "imm" + }; + + static const char *type[] = { + "ud", + "d", + "uw", + "w", + "ub", + "vf", + "hf", + "f" + }; + + printf("%s%s", + hwreg.abs ? "abs/" : "", + hwreg.negate ? "-" : ""); + + if (hwreg.file == BRW_GENERAL_REGISTER_FILE && + hwreg.nr % 2 == 0 && + hwreg.subnr == 0 && + hwreg.vstride == BRW_VERTICAL_STRIDE_8 && + hwreg.width == BRW_WIDTH_8 && + hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 && + hwreg.type == BRW_REGISTER_TYPE_F) { + /* vector register */ + printf("vec%d", hwreg.nr); + } + else if (hwreg.file == BRW_GENERAL_REGISTER_FILE && + hwreg.vstride == BRW_VERTICAL_STRIDE_0 && + hwreg.width == BRW_WIDTH_1 && + hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 && + hwreg.type == BRW_REGISTER_TYPE_F) { + /* "scalar" register */ + printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4); + } + else if (hwreg.file == BRW_IMMEDIATE_VALUE) { + printf("imm %f", hwreg.dw1.f); + } + else { + printf("%s%d.%d<%d;%d,%d>:%s", + file[hwreg.file], + hwreg.nr, + hwreg.subnr / type_sz(hwreg.type), + hwreg.vstride ? (1<<(hwreg.vstride-1)) : 0, + 1< + */ + +#include "brw_eu.h" + +#include +#include + +/*********************************************************************** + * Internal helper for constructing instructions + */ + +static void guess_execution_size(struct brw_compile *p, + struct brw_instruction *insn, + struct brw_reg reg) +{ + if (reg.width == BRW_WIDTH_8 && p->compressed) + insn->header.execution_size = BRW_EXECUTE_16; + else + insn->header.execution_size = reg.width; +} + + +/** + * Prior to Sandybridge, the SEND instruction accepted non-MRF source + * registers, implicitly moving the operand to a message register. + * + * On Sandybridge, this is no longer the case. 
This function performs the + * explicit move; it should be called before emitting a SEND instruction. + */ +void +gen6_resolve_implied_move(struct brw_compile *p, + struct brw_reg *src, + unsigned msg_reg_nr) +{ + if (p->gen < 60) + return; + + if (src->file == BRW_MESSAGE_REGISTER_FILE) + return; + + if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) { + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, __retype_ud(brw_message_reg(msg_reg_nr)), __retype_ud(*src)); + brw_pop_insn_state(p); + } + *src = brw_message_reg(msg_reg_nr); +} + +static void +gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg) +{ + /* From the BSpec / ISA Reference / send - [DevIVB+]: + * "The send with EOT should use register space R112-R127 for . This is + * to enable loading of a new thread into the same slot while the message + * with EOT for current thread is pending dispatch." + * + * Since we're pretending to have 16 MRFs anyway, we may as well use the + * registers required for messages with EOT. 
+ */ + if (p->gen >= 70 && reg->file == BRW_MESSAGE_REGISTER_FILE) { + reg->file = BRW_GENERAL_REGISTER_FILE; + reg->nr += 111; + } +} + +void +brw_set_dest(struct brw_compile *p, struct brw_instruction *insn, + struct brw_reg dest) +{ + if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE && + dest.file != BRW_MESSAGE_REGISTER_FILE) + assert(dest.nr < 128); + + gen7_convert_mrf_to_grf(p, &dest); + + insn->bits1.da1.dest_reg_file = dest.file; + insn->bits1.da1.dest_reg_type = dest.type; + insn->bits1.da1.dest_address_mode = dest.address_mode; + + if (dest.address_mode == BRW_ADDRESS_DIRECT) { + insn->bits1.da1.dest_reg_nr = dest.nr; + + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits1.da1.dest_subreg_nr = dest.subnr; + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + insn->bits1.da1.dest_horiz_stride = dest.hstride; + } else { + insn->bits1.da16.dest_subreg_nr = dest.subnr / 16; + insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask; + /* even ignored in da16, still need to set as '01' */ + insn->bits1.da16.dest_horiz_stride = 1; + } + } else { + insn->bits1.ia1.dest_subreg_nr = dest.subnr; + + /* These are different sizes in align1 vs align16: + */ + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset; + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + insn->bits1.ia1.dest_horiz_stride = dest.hstride; + } + else { + insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset; + /* even ignored in da16, still need to set as '01' */ + insn->bits1.ia16.dest_horiz_stride = 1; + } + } + + guess_execution_size(p, insn, dest); +} + +static const int reg_type_size[8] = { + [0] = 4, + [1] = 4, + [2] = 2, + [3] = 2, + [4] = 1, + [5] = 1, + [7] = 4 +}; + +static void +validate_reg(struct brw_instruction *insn, struct brw_reg reg) +{ + int hstride_for_reg[] = {0, 1, 2, 4}; + int vstride_for_reg[] = {0, 1, 2, 4, 
8, 16, 32, 64, 128, 256}; + int width_for_reg[] = {1, 2, 4, 8, 16}; + int execsize_for_reg[] = {1, 2, 4, 8, 16}; + int width, hstride, vstride, execsize; + + if (reg.file == BRW_IMMEDIATE_VALUE) { + /* 3.3.6: Region Parameters. Restriction: Immediate vectors + * mean the destination has to be 128-bit aligned and the + * destination horiz stride has to be a word. + */ + if (reg.type == BRW_REGISTER_TYPE_V) { + assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] * + reg_type_size[insn->bits1.da1.dest_reg_type] == 2); + } + + return; + } + + if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE && + reg.file == BRW_ARF_NULL) + return; + + hstride = hstride_for_reg[reg.hstride]; + + if (reg.vstride == 0xf) { + vstride = -1; + } else { + vstride = vstride_for_reg[reg.vstride]; + } + + width = width_for_reg[reg.width]; + + execsize = execsize_for_reg[insn->header.execution_size]; + + /* Restrictions from 3.3.10: Register Region Restrictions. */ + /* 3. */ + assert(execsize >= width); + + /* 4. */ + if (execsize == width && hstride != 0) { + assert(vstride == -1 || vstride == width * hstride); + } + + /* 5. */ + if (execsize == width && hstride == 0) { + /* no restriction on vstride. */ + } + + /* 6. */ + if (width == 1) { + assert(hstride == 0); + } + + /* 7. */ + if (execsize == 1 && width == 1) { + assert(hstride == 0); + assert(vstride == 0); + } + + /* 8. */ + if (vstride == 0 && hstride == 0) { + assert(width == 1); + } + + /* 10. Check destination issues. 
*/ +} + +void +brw_set_src0(struct brw_compile *p, struct brw_instruction *insn, + struct brw_reg reg) +{ + if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE) + assert(reg.nr < 128); + + gen7_convert_mrf_to_grf(p, ®); + + validate_reg(insn, reg); + + insn->bits1.da1.src0_reg_file = reg.file; + insn->bits1.da1.src0_reg_type = reg.type; + insn->bits2.da1.src0_abs = reg.abs; + insn->bits2.da1.src0_negate = reg.negate; + insn->bits2.da1.src0_address_mode = reg.address_mode; + + if (reg.file == BRW_IMMEDIATE_VALUE) { + insn->bits3.ud = reg.dw1.ud; + + /* Required to set some fields in src1 as well: + */ + insn->bits1.da1.src1_reg_file = 0; /* arf */ + insn->bits1.da1.src1_reg_type = reg.type; + } else { + if (reg.address_mode == BRW_ADDRESS_DIRECT) { + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits2.da1.src0_subreg_nr = reg.subnr; + insn->bits2.da1.src0_reg_nr = reg.nr; + } else { + insn->bits2.da16.src0_subreg_nr = reg.subnr / 16; + insn->bits2.da16.src0_reg_nr = reg.nr; + } + } else { + insn->bits2.ia1.src0_subreg_nr = reg.subnr; + + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset; + } else { + insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset; + } + } + + if (insn->header.access_mode == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + insn->header.execution_size == BRW_EXECUTE_1) { + insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0; + insn->bits2.da1.src0_width = BRW_WIDTH_1; + insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0; + } else { + insn->bits2.da1.src0_horiz_stride = reg.hstride; + insn->bits2.da1.src0_width = reg.width; + insn->bits2.da1.src0_vert_stride = reg.vstride; + } + } else { + insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X); + insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y); + insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z); + 
		insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

		/* This is an oddity of the fact we're using the same
		 * descriptions for registers in align_16 as align_1:
		 */
		if (reg.vstride == BRW_VERTICAL_STRIDE_8)
			insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
		else
			insn->bits2.da16.src0_vert_stride = reg.vstride;
		}
	}
}

/* Encode 'reg' into the src1 operand fields of 'insn'.
 * src1 may be an immediate; MRF sources are not allowed here
 * (gen7 rewrites MRF to GRF first).
 */
void brw_set_src1(struct brw_compile *p,
		  struct brw_instruction *insn,
		  struct brw_reg reg)
{
	assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
	assert(reg.nr < 128);

	gen7_convert_mrf_to_grf(p, &reg);

	validate_reg(insn, reg);

	insn->bits1.da1.src1_reg_file = reg.file;
	insn->bits1.da1.src1_reg_type = reg.type;
	insn->bits3.da1.src1_abs = reg.abs;
	insn->bits3.da1.src1_negate = reg.negate;

	/* Only src1 can be immediate in two-argument instructions. */
	assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

	if (reg.file == BRW_IMMEDIATE_VALUE) {
		insn->bits3.ud = reg.dw1.ud;
	} else {
		/* This is a hardware restriction, which may or may not be lifted
		 * in the future:
		 */
		assert (reg.address_mode == BRW_ADDRESS_DIRECT);
		/* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

		if (insn->header.access_mode == BRW_ALIGN_1) {
			insn->bits3.da1.src1_subreg_nr = reg.subnr;
			insn->bits3.da1.src1_reg_nr = reg.nr;
		} else {
			insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
			insn->bits3.da16.src1_reg_nr = reg.nr;
		}

		if (insn->header.access_mode == BRW_ALIGN_1) {
			/* A scalar (W1/E1) source collapses to a 0-stride region. */
			if (reg.width == BRW_WIDTH_1 &&
			    insn->header.execution_size == BRW_EXECUTE_1) {
				insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
				insn->bits3.da1.src1_width = BRW_WIDTH_1;
				insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
			} else {
				insn->bits3.da1.src1_horiz_stride = reg.hstride;
				insn->bits3.da1.src1_width = reg.width;
				insn->bits3.da1.src1_vert_stride = reg.vstride;
			}
		} else {
			insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
			insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
			insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
			insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

			/* This is an oddity of the fact we're using the same
			 * descriptions for registers in align_16 as align_1:
			 */
			if (reg.vstride == BRW_VERTICAL_STRIDE_8)
				insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
			else
				insn->bits3.da16.src1_vert_stride = reg.vstride;
		}
	}
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
	/* src1 carries the (immediate) message descriptor; clear it first. */
	brw_set_src1(p, inst, brw_imm_d(0));

	if (p->gen >= 50) {
		inst->bits3.generic_gen5.header_present = header_present;
		inst->bits3.generic_gen5.response_length = response_length;
		inst->bits3.generic_gen5.msg_length = msg_length;
		inst->bits3.generic_gen5.end_of_thread = end_of_thread;

		if (p->gen >= 60) {
			/* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
			inst->header.destreg__conditionalmod = sfid;
		} else {
			/* Set Extended Message Descriptor (ex_desc) */
			inst->bits2.send_gen5.sfid = sfid;
			inst->bits2.send_gen5.end_of_thread = end_of_thread;
		}
	} else {
		inst->bits3.generic.response_length = response_length;
		inst->bits3.generic.msg_length = msg_length;
		inst->bits3.generic.msg_target = sfid;
		inst->bits3.generic.end_of_thread = end_of_thread;
	}
}

/* Fill in the SEND descriptor for a message to the extended-math
 * shared function.  Message/response lengths are inferred from the
 * math function being requested.
 */
static void brw_set_math_message(struct brw_compile *p,
				 struct brw_instruction *insn,
				 unsigned function,
				 unsigned integer_type,
				 bool low_precision,
				 bool saturate,
				 unsigned dataType)
{
	unsigned msg_length;
	unsigned response_length;

	/* Infer message length from the function: two-operand math
	 * functions take a two-register payload.
	 */
	switch (function) {
	case BRW_MATH_FUNCTION_POW:
	case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
	case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
	case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
		msg_length = 2;
		break;
	default:
		msg_length = 1;
		break;
	}

	/* Infer response length from the function: two-result functions
	 * write back two registers.
	 */
	switch (function) {
	case BRW_MATH_FUNCTION_SINCOS:
	case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
		response_length = 2;
		break;
	default:
		response_length = 1;
		break;
	}

	brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
				   msg_length, response_length,
				   false, false);
	if (p->gen == 50) {
		insn->bits3.math_gen5.function = function;
		insn->bits3.math_gen5.int_type = integer_type;
		insn->bits3.math_gen5.precision = low_precision;
		insn->bits3.math_gen5.saturate = saturate;
		insn->bits3.math_gen5.data_type = dataType;
		insn->bits3.math_gen5.snapshot = 0;
	} else {
		insn->bits3.math.function = function;
		insn->bits3.math.int_type = integer_type;
		insn->bits3.math.precision = low_precision;
		insn->bits3.math.saturate = saturate;
		insn->bits3.math.data_type = dataType;
	}
}

/* Fill in the SEND descriptor for a gen5 URB FF_SYNC message
 * (opcode 1); most URB fields are unused for FF_SYNC.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    unsigned response_length,
				    bool end_of_thread)
{
	brw_set_message_descriptor(p, insn, BRW_SFID_URB,
				   1, response_length,
				   true, end_of_thread);
	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.allocate = allocate;
	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}

/* Fill in the SEND descriptor for a URB write message, using the
 * per-generation URB descriptor layout.
 */
static void brw_set_urb_message(struct brw_compile *p,
				struct brw_instruction *insn,
				bool allocate,
				bool used,
				unsigned msg_length,
				unsigned response_length,
				bool end_of_thread,
				bool complete,
				unsigned offset,
				unsigned swizzle_control)
{
	brw_set_message_descriptor(p, insn, BRW_SFID_URB,
				   msg_length, response_length, true, end_of_thread);
	if (p->gen >= 70) {
		insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
		insn->bits3.urb_gen7.offset = offset;
		assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
		insn->bits3.urb_gen7.swizzle_control = swizzle_control;
		/* per_slot_offset = 0 makes it ignore offsets in message header */
		insn->bits3.urb_gen7.per_slot_offset = 0;
		insn->bits3.urb_gen7.complete = complete;
	} else if (p->gen >= 50) {
		insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
		insn->bits3.urb_gen5.offset = offset;
		insn->bits3.urb_gen5.swizzle_control = swizzle_control;
		insn->bits3.urb_gen5.allocate = allocate;
		insn->bits3.urb_gen5.used = used;	/* ? */
		insn->bits3.urb_gen5.complete = complete;
	} else {
		insn->bits3.urb.opcode = 0;	/* ? */
		insn->bits3.urb.offset = offset;
		insn->bits3.urb.swizzle_control = swizzle_control;
		insn->bits3.urb.allocate = allocate;
		insn->bits3.urb.used = used;	/* ? */
		insn->bits3.urb.complete = complete;
	}
}

/* Fill in the SEND descriptor for a data-port write, choosing the
 * SFID/cache appropriate for the hardware generation and message type.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 unsigned binding_table_index,
			 unsigned msg_control,
			 unsigned msg_type,
			 unsigned msg_length,
			 bool header_present,
			 bool last_render_target,
			 unsigned response_length,
			 bool end_of_thread,
			 bool send_commit_msg)
{
	unsigned sfid;

	if (p->gen >= 70) {
		/* Use the Render Cache for RT writes; otherwise use the Data Cache */
		if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
			sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
		else
			sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
	} else if (p->gen >= 60) {
		/* Use the render cache for all write messages. */
		sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
	} else {
		sfid = BRW_SFID_DATAPORT_WRITE;
	}

	brw_set_message_descriptor(p, insn, sfid,
				   msg_length, response_length,
				   header_present, end_of_thread);

	if (p->gen >= 70) {
		insn->bits3.gen7_dp.binding_table_index = binding_table_index;
		insn->bits3.gen7_dp.msg_control = msg_control;
		insn->bits3.gen7_dp.last_render_target = last_render_target;
		insn->bits3.gen7_dp.msg_type = msg_type;
	} else if (p->gen >= 60) {
		insn->bits3.gen6_dp.binding_table_index = binding_table_index;
		insn->bits3.gen6_dp.msg_control = msg_control;
		insn->bits3.gen6_dp.last_render_target = last_render_target;
		insn->bits3.gen6_dp.msg_type = msg_type;
		insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
	} else if (p->gen >= 50) {
		insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
		insn->bits3.dp_write_gen5.msg_control = msg_control;
		insn->bits3.dp_write_gen5.last_render_target = last_render_target;
		insn->bits3.dp_write_gen5.msg_type = msg_type;
		insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
	} else {
		insn->bits3.dp_write.binding_table_index = binding_table_index;
		insn->bits3.dp_write.msg_control = msg_control;
		insn->bits3.dp_write.last_render_target = last_render_target;
		insn->bits3.dp_write.msg_type = msg_type;
		insn->bits3.dp_write.send_commit_msg = send_commit_msg;
	}
}

/* Fill in the SEND descriptor for a data-port read, choosing the
 * SFID/cache appropriate for the hardware generation and target cache.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			unsigned binding_table_index,
			unsigned msg_control,
			unsigned msg_type,
			unsigned target_cache,
			unsigned msg_length,
			unsigned response_length)
{
	unsigned sfid;

	if (p->gen >= 70) {
		sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
	} else if (p->gen >= 60) {
		if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
			sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
		else
			sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
	} else {
		sfid = BRW_SFID_DATAPORT_READ;
	}

	brw_set_message_descriptor(p, insn, sfid,
				   msg_length, response_length,
				   true, false);

	if (p->gen >= 70) {
		insn->bits3.gen7_dp.binding_table_index = binding_table_index;
		insn->bits3.gen7_dp.msg_control = msg_control;
		insn->bits3.gen7_dp.last_render_target = 0;
		insn->bits3.gen7_dp.msg_type = msg_type;
	} else if (p->gen >= 60) {
		insn->bits3.gen6_dp.binding_table_index = binding_table_index;
		insn->bits3.gen6_dp.msg_control = msg_control;
		insn->bits3.gen6_dp.last_render_target = 0;
		insn->bits3.gen6_dp.msg_type = msg_type;
		insn->bits3.gen6_dp.send_commit_msg = 0;
	} else if (p->gen >= 50) {
		insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
		insn->bits3.dp_read_gen5.msg_control = msg_control;
		insn->bits3.dp_read_gen5.msg_type = msg_type;
		insn->bits3.dp_read_gen5.target_cache = target_cache;
	} else if (p->gen >= 45) {
		insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
		insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
		insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
		insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
	} else {
		insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
		insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
		insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
		insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
	}
}

/* Fill in the SEND descriptor for a sampler message, using the
 * per-generation sampler descriptor layout.
 */
static void brw_set_sampler_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    unsigned binding_table_index,
				    unsigned sampler,
				    unsigned msg_type,
				    unsigned response_length,
				    unsigned msg_length,
				    bool header_present,
				    unsigned simd_mode)
{
	brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER,
				   msg_length, response_length,
				   header_present, false);

	if (p->gen >= 70) {
		insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
		insn->bits3.sampler_gen7.sampler = sampler;
		insn->bits3.sampler_gen7.msg_type = msg_type;
		insn->bits3.sampler_gen7.simd_mode = simd_mode;
	} else if (p->gen >= 50) {
		insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
		insn->bits3.sampler_gen5.sampler = sampler;
		insn->bits3.sampler_gen5.msg_type = msg_type;
		insn->bits3.sampler_gen5.simd_mode = simd_mode;
	} else if (p->gen >= 45) {
		insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
		insn->bits3.sampler_g4x.sampler = sampler;
		insn->bits3.sampler_g4x.msg_type = msg_type;
	} else {
		insn->bits3.sampler.binding_table_index = binding_table_index;
		insn->bits3.sampler.sampler = sampler;
		insn->bits3.sampler.msg_type = msg_type;
		insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
	}
}


/* Emit a NOP instruction (g0 operands per convention). */
void brw_NOP(struct brw_compile *p)
{
	struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_NOP);
	brw_set_dest(p, insn, __retype_ud(brw_vec4_grf(0,0)));
	brw_set_src0(p, insn, __retype_ud(brw_vec4_grf(0,0)));
	brw_set_src1(p, insn, brw_imm_ud(0x0));
}

/***********************************************************************
 * Comparisons, if/else/endif
 */

/* Push an IF/ELSE instruction onto the compiler's if-stack, growing
 * (doubling) the backing array when full so ENDIF can find it later.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
	p->if_stack[p->if_stack_depth] = inst;

	p->if_stack_depth++;
	if (p->if_stack_array_size <= p->if_stack_depth) {
		p->if_stack_array_size *= 2;
		p->if_stack = realloc(p->if_stack,
				      sizeof(struct brw_instruction *)*p->if_stack_array_size);
	}
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, unsigned execute_size)
{
	struct brw_instruction *insn;

	insn = brw_next_insn(p, BRW_OPCODE_IF);

	/* Override the defaults for this instruction: */
	if (p->gen < 60) {
		brw_set_dest(p, insn, brw_ip_reg());
		brw_set_src0(p, insn, brw_ip_reg());
		brw_set_src1(p, insn, brw_imm_d(0x0));
	} else if (p->gen < 70) {
		brw_set_dest(p, insn, brw_imm_w(0));
		insn->bits1.branch_gen6.jump_count = 0;
		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
		brw_set_src1(p, insn, __retype_d(brw_null_reg()));
	} else {
		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
		brw_set_src1(p, insn, brw_imm_ud(0));
		insn->bits3.break_cont.jip = 0;
		insn->bits3.break_cont.uip = 0;
	}

	insn->header.execution_size = execute_size;
	insn->header.compression_control = BRW_COMPRESSION_NONE;
	insn->header.predicate_control = BRW_PREDICATE_NORMAL;
	insn->header.mask_control = BRW_MASK_ENABLE;
	if (!p->single_program_flow)
		insn->header.thread_control = BRW_THREAD_SWITCH;

	/* Predication was consumed by the IF itself. */
	p->current->header.predicate_control = BRW_PREDICATE_NONE;

	push_if_stack(p, insn);
	return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
	struct brw_instruction *insn;

	insn = brw_next_insn(p, BRW_OPCODE_IF);

	brw_set_dest(p, insn, brw_imm_w(0));
	if (p->compressed) {
		insn->header.execution_size = BRW_EXECUTE_16;
	} else {
		insn->header.execution_size = BRW_EXECUTE_8;
	}
	insn->bits1.branch_gen6.jump_count = 0;
	brw_set_src0(p, insn, src0);
	brw_set_src1(p, insn, src1);

	assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
	assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
	insn->header.destreg__conditionalmod = conditional;

	if (!p->single_program_flow)
		insn->header.thread_control = BRW_THREAD_SWITCH;

	push_if_stack(p, insn);
	return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
	/* The next instruction (where the ENDIF would be, if it existed) */
	struct brw_instruction *next_inst = &p->store[p->nr_insn];

	assert(p->single_program_flow);
	assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
	assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
	assert(if_inst->header.execution_size == BRW_EXECUTE_1);

	/* Convert IF to an ADD instruction that moves the instruction pointer
	 * to the first instruction of the ELSE block.  If there is no ELSE
	 * block, point to where ENDIF would be.  Reverse the predicate.
	 *
	 * There's no need to execute an ENDIF since we don't need to do any
	 * stack operations, and if we're currently executing, we just want to
	 * continue normally.
	 */
	if_inst->header.opcode = BRW_OPCODE_ADD;
	if_inst->header.predicate_inverse = 1;

	if (else_inst != NULL) {
		/* Convert ELSE to an ADD instruction that points where the ENDIF
		 * would be.
		 */
		else_inst->header.opcode = BRW_OPCODE_ADD;

		/* Jump distances are in bytes: 16 bytes per 128-bit instruction. */
		if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
		else_inst->bits3.ud = (next_inst - else_inst) * 16;
	} else {
		if_inst->bits3.ud = (next_inst - if_inst) * 16;
	}
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
	unsigned br = 1;

	assert(!p->single_program_flow);
	assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
	assert(endif_inst != NULL);
	assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

	/* Jump count is for 64bit data chunk each, so one 128bit instruction
	 * requires 2 chunks.
	 */
	if (p->gen >= 50)
		br = 2;

	assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
	endif_inst->header.execution_size = if_inst->header.execution_size;

	if (else_inst == NULL) {
		/* Patch IF -> ENDIF */
		if (p->gen < 60) {
			/* Turn it into an IFF, which means no mask stack operations for
			 * all-false and jumping past the ENDIF.
			 */
			if_inst->header.opcode = BRW_OPCODE_IFF;
			if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
			if_inst->bits3.if_else.pop_count = 0;
			if_inst->bits3.if_else.pad0 = 0;
		} else if (p->gen < 70) {
			/* As of gen6, there is no IFF and IF must point to the ENDIF. */
			if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
		} else {
			if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
			if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
		}
	} else {
		else_inst->header.execution_size = if_inst->header.execution_size;

		/* Patch IF -> ELSE */
		if (p->gen < 60) {
			if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
			if_inst->bits3.if_else.pop_count = 0;
			if_inst->bits3.if_else.pad0 = 0;
		} else if (p->gen <= 70) {
			/* NOTE(review): `<= 70` also matches gen7, which writes
			 * bits1.branch_gen6.jump_count over the gen7 dest fields set
			 * by brw_IF; the gen7 jump targets are instead encoded in
			 * bits3.break_cont below.  Presumably this should be
			 * `< 70` (gen6 only) — confirm against upstream.
			 */
			if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
		}

		/* Patch ELSE -> ENDIF */
		if (p->gen < 60) {
			/* BRW_OPCODE_ELSE pre-gen6 should point just past the
			 * matching ENDIF.
			 */
			else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
			else_inst->bits3.if_else.pop_count = 1;
			else_inst->bits3.if_else.pad0 = 0;
		} else if (p->gen < 70) {
			/* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
			else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
		} else {
			/* The IF instruction's JIP should point just past the ELSE */
			if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
			/* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
			if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
			else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
		}
	}
}

/* Emit an ELSE, recording it on the if-stack so ENDIF can patch the
 * jump targets of the matching IF/ELSE pair.
 */
void
brw_ELSE(struct brw_compile *p)
{
	struct brw_instruction *insn;

	insn = brw_next_insn(p, BRW_OPCODE_ELSE);

	if (p->gen < 60) {
		brw_set_dest(p, insn, brw_ip_reg());
		brw_set_src0(p, insn, brw_ip_reg());
		brw_set_src1(p, insn, brw_imm_d(0x0));
	} else if (p->gen < 70) {
		brw_set_dest(p, insn, brw_imm_w(0));
		insn->bits1.branch_gen6.jump_count = 0;
		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
		brw_set_src1(p, insn, __retype_d(brw_null_reg()));
	} else {
		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
		brw_set_src1(p, insn, brw_imm_ud(0));
		insn->bits3.break_cont.jip = 0;
		insn->bits3.break_cont.uip = 0;
	}

	insn->header.compression_control = BRW_COMPRESSION_NONE;
	insn->header.mask_control = BRW_MASK_ENABLE;
	if (!p->single_program_flow)
		insn->header.thread_control = BRW_THREAD_SWITCH;

	push_if_stack(p, insn);
}

/* Emit an ENDIF (or, in SPF mode, rewrite the IF/ELSE into ADDs),
 * popping the matching IF and optional ELSE off the if-stack and
 * patching their jump targets.
 */
void
brw_ENDIF(struct brw_compile *p)
{
	struct brw_instruction *insn;
	struct brw_instruction *else_inst = NULL;
	struct brw_instruction *if_inst = NULL;

	/* Pop the IF and (optional) ELSE instructions from the stack */
	p->if_stack_depth--;
	if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
		else_inst = p->if_stack[p->if_stack_depth];
		p->if_stack_depth--;
	}
	if_inst = p->if_stack[p->if_stack_depth];

	if (p->single_program_flow) {
		/* ENDIF is useless; don't bother emitting it. */
		convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
		return;
	}

	insn = brw_next_insn(p, BRW_OPCODE_ENDIF);

	if (p->gen < 60) {
		brw_set_dest(p, insn, __retype_ud(brw_vec4_grf(0,0)));
		brw_set_src0(p, insn, __retype_ud(brw_vec4_grf(0,0)));
		brw_set_src1(p, insn, brw_imm_d(0x0));
	} else if (p->gen < 70) {
		brw_set_dest(p, insn, brw_imm_w(0));
		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
		brw_set_src1(p, insn, __retype_d(brw_null_reg()));
	} else {
		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
		brw_set_src1(p, insn, brw_imm_ud(0));
	}

	insn->header.compression_control = BRW_COMPRESSION_NONE;
	insn->header.mask_control = BRW_MASK_ENABLE;
	insn->header.thread_control = BRW_THREAD_SWITCH;

	/* Also pop item off the stack in the endif instruction: */
	if (p->gen < 60) {
		insn->bits3.if_else.jump_count = 0;
		insn->bits3.if_else.pop_count = 1;
		insn->bits3.if_else.pad0 = 0;
	} else if (p->gen < 70) {
		insn->bits1.branch_gen6.jump_count = 2;
	} else {
		insn->bits3.break_cont.jip = 2;
	}
	patch_IF_ELSE(p, if_inst, else_inst, insn);
}

/* Emit a BREAK; on pre-gen6 the given pop_count unwinds the mask stack. */
struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
{
	struct brw_instruction *insn;

	insn = brw_next_insn(p, BRW_OPCODE_BREAK);
	if (p->gen >= 60) {
		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
		brw_set_src1(p, insn, brw_imm_d(0x0));
	} else {
		brw_set_dest(p, insn, brw_ip_reg());
		brw_set_src0(p, insn, brw_ip_reg());
		brw_set_src1(p, insn, brw_imm_d(0x0));
		insn->bits3.if_else.pad0 = 0;
		insn->bits3.if_else.pop_count = pop_count;
	}
	insn->header.compression_control = BRW_COMPRESSION_NONE;
	insn->header.execution_size = BRW_EXECUTE_8;

	return insn;
}

/* Emit a gen6 CONTINUE instruction. */
struct brw_instruction *gen6_CONT(struct brw_compile *p,
				  struct brw_instruction *do_insn)
{
	struct brw_instruction *insn;

	insn = brw_next_insn(p, BRW_OPCODE_CONTINUE);
	/* NOTE(review): the null-reg dest/src0 set here is immediately
	 * overwritten by the ip-reg set below — the first pair looks like
	 * dead code; confirm which operand encoding gen6 CONTINUE needs.
	 */
	brw_set_dest(p, insn, __retype_d(brw_null_reg()));
	brw_set_src0(p, insn, __retype_d(brw_null_reg()));
	brw_set_dest(p, insn, brw_ip_reg());
	brw_set_src0(p, insn, brw_ip_reg());
	brw_set_src1(p, insn, brw_imm_d(0x0));

	insn->header.compression_control = BRW_COMPRESSION_NONE;
	insn->header.execution_size = BRW_EXECUTE_8;
	return insn;
}

/* Emit a pre-gen6 CONTINUE; pop_count unwinds the mask stack. */
struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
{
	struct brw_instruction *insn;
	insn = brw_next_insn(p, BRW_OPCODE_CONTINUE);
	brw_set_dest(p, insn, brw_ip_reg());
	brw_set_src0(p, insn, brw_ip_reg());
	brw_set_src1(p, insn, brw_imm_d(0x0));
	insn->header.compression_control = BRW_COMPRESSION_NONE;
	insn->header.execution_size = BRW_EXECUTE_8;
	/* insn->header.mask_control = BRW_MASK_DISABLE; */
	insn->bits3.if_else.pad0 = 0;
	insn->bits3.if_else.pop_count = pop_count;
	return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
{
	if (p->gen >= 60 || p->single_program_flow) {
		/* No DO instruction needed; return the loop-top position so
		 * WHILE can compute its backward jump.
		 */
		return &p->store[p->nr_insn];
	} else {
		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_DO);

		/* Override the defaults for this instruction:
		 */
		brw_set_dest(p, insn, brw_null_reg());
		brw_set_src0(p, insn, brw_null_reg());
		brw_set_src1(p, insn, brw_null_reg());

		insn->header.compression_control = BRW_COMPRESSION_NONE;
		insn->header.execution_size = execute_size;
		insn->header.predicate_control = BRW_PREDICATE_NONE;
		/* insn->header.mask_control = BRW_MASK_ENABLE; */
		/* insn->header.mask_control = BRW_MASK_DISABLE; */

		return insn;
	}
}

/* Emit the loop-closing WHILE (or, in SPF mode pre-gen6, a plain ADD
 * to ip), with the backward jump aimed at do_insn.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p,
				  struct brw_instruction *do_insn)
{
	struct brw_instruction *insn;
	unsigned br = 1;

	/* Jump counts are in 64-bit chunks from gen5 onwards. */
	if (p->gen >= 50)
		br = 2;

	if (p->gen >= 70) {
		insn = brw_next_insn(p, BRW_OPCODE_WHILE);

		brw_set_dest(p, insn, __retype_d(brw_null_reg()));
		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
		brw_set_src1(p, insn, brw_imm_ud(0));
		insn->bits3.break_cont.jip = br * (do_insn - insn);

		insn->header.execution_size = BRW_EXECUTE_8;
	} else if (p->gen >= 60) {
		insn = brw_next_insn(p, BRW_OPCODE_WHILE);

		brw_set_dest(p, insn, brw_imm_w(0));
		insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
		brw_set_src0(p, insn, __retype_d(brw_null_reg()));
		brw_set_src1(p, insn, __retype_d(brw_null_reg()));

		insn->header.execution_size = BRW_EXECUTE_8;
	} else {
		if (p->single_program_flow) {
			insn = brw_next_insn(p, BRW_OPCODE_ADD);

			brw_set_dest(p, insn, brw_ip_reg());
			brw_set_src0(p, insn, brw_ip_reg());
			brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
			insn->header.execution_size = BRW_EXECUTE_1;
		} else {
			insn = brw_next_insn(p, BRW_OPCODE_WHILE);

			assert(do_insn->header.opcode == BRW_OPCODE_DO);

			brw_set_dest(p, insn, brw_ip_reg());
			brw_set_src0(p, insn, brw_ip_reg());
			brw_set_src1(p, insn, brw_imm_d(0));

			insn->header.execution_size = do_insn->header.execution_size;
			insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
			insn->bits3.if_else.pop_count = 0;
			insn->bits3.if_else.pad0 = 0;
		}
	}
	insn->header.compression_control = BRW_COMPRESSION_NONE;
	p->current->header.predicate_control = BRW_PREDICATE_NONE;

	return insn;
}

/* FORWARD JUMPS:
 */
void brw_land_fwd_jump(struct brw_compile *p,
		       struct brw_instruction *jmp_insn)
{
	struct brw_instruction *landing = &p->store[p->nr_insn];
	unsigned jmpi = 1;

	if (p->gen >= 50)
		jmpi = 2;

	assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
	assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

	/* JMPI is relative to the instruction after itself, hence the -1. */
	jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
}



/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register.  It might be simpler
 * just to use the flag reg for most WM tasks?
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     unsigned conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
	struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_CMP);

	insn->header.destreg__conditionalmod = conditional;
	brw_set_dest(p, insn, dest);
	brw_set_src0(p, insn, src0);
	brw_set_src1(p, insn, src1);

	/* Make it so that future instructions will use the computed flag
	 * value until brw_set_predicate_control_flag_value() is called
	 * again.
	 */
	if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
	    dest.nr == 0) {
		p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
		p->flag_value = 0xff;
	}
}

/* Issue 'wait' instruction for n1, host could program MMIO
   to wake up thread. */
void brw_WAIT(struct brw_compile *p)
{
	struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_WAIT);
	struct brw_reg src = brw_notification_1_reg();

	brw_set_dest(p, insn, src);
	brw_set_src0(p, insn, src);
	brw_set_src1(p, insn, brw_null_reg());
	insn->header.execution_size = 0; /* must */
	insn->header.predicate_control = 0;
	insn->header.compression_control = 0;
}

/***********************************************************************
 * Helpers for the various SEND message types:
 */

/** Extended math function, float[8].
 */
void brw_math(struct brw_compile *p,
	      struct brw_reg dest,
	      unsigned function,
	      unsigned saturate,
	      unsigned msg_reg_nr,
	      struct brw_reg src,
	      unsigned data_type,
	      unsigned precision)
{
	if (p->gen >= 60) {
		/* Gen6+: math is a regular instruction, not a SEND. */
		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_MATH);

		assert(dest.file == BRW_GENERAL_REGISTER_FILE);
		assert(src.file == BRW_GENERAL_REGISTER_FILE);

		assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
		assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

		/* Source modifiers are ignored for extended math instructions. */
		assert(!src.negate);
		assert(!src.abs);

		if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
		    function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
			assert(src.type == BRW_REGISTER_TYPE_F);
		}

		/* Math is the same ISA format as other opcodes, except that CondModifier
		 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
		 */
		insn->header.destreg__conditionalmod = function;
		insn->header.saturate = saturate;

		brw_set_dest(p, insn, dest);
		brw_set_src0(p, insn, src);
		brw_set_src1(p, insn, brw_null_reg());
	} else {
		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
		/* Example code doesn't set predicate_control for send
		 * instructions.
		 */
		insn->header.predicate_control = 0;
		insn->header.destreg__conditionalmod = msg_reg_nr;

		brw_set_dest(p, insn, dest);
		brw_set_src0(p, insn, src);
		brw_set_math_message(p, insn, function,
				     src.type == BRW_REGISTER_TYPE_D,
				     precision,
				     saturate,
				     data_type);
	}
}

/** Extended math function, float[8].
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
	struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_MATH);

	assert(dest.file == BRW_GENERAL_REGISTER_FILE);
	assert(src0.file == BRW_GENERAL_REGISTER_FILE);
	assert(src1.file == BRW_GENERAL_REGISTER_FILE);

	assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
	assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
	assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);

	if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
	    function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
		assert(src0.type == BRW_REGISTER_TYPE_F);
		assert(src1.type == BRW_REGISTER_TYPE_F);
	}

	/* Source modifiers are ignored for extended math instructions. */
	assert(!src0.negate);
	assert(!src0.abs);
	assert(!src1.negate);
	assert(!src1.abs);

	/* Math is the same ISA format as other opcodes, except that CondModifier
	 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
	 */
	insn->header.destreg__conditionalmod = function;

	brw_set_dest(p, insn, dest);
	brw_set_src0(p, insn, src0);
	brw_set_src1(p, insn, src1);
}

/**
 * Extended math function, float[16].
 * Use 2 send instructions.
 */
void brw_math_16(struct brw_compile *p,
		 struct brw_reg dest,
		 unsigned function,
		 unsigned saturate,
		 unsigned msg_reg_nr,
		 struct brw_reg src,
		 unsigned precision)
{
	struct brw_instruction *insn;

	if (p->gen >= 60) {
		/* Gen6+: a single MATH instruction covers the full width. */
		insn = brw_next_insn(p, BRW_OPCODE_MATH);

		/* Math is the same ISA format as other opcodes, except that CondModifier
		 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
		 */
		insn->header.destreg__conditionalmod = function;
		insn->header.saturate = saturate;

		/* Source modifiers are ignored for extended math instructions. */
		assert(!src.negate);
		assert(!src.abs);

		brw_set_dest(p, insn, dest);
		brw_set_src0(p, insn, src);
		brw_set_src1(p, insn, brw_null_reg());
		return;
	}

	/* First instruction:
	 */
	brw_push_insn_state(p);
	brw_set_predicate_control_flag_value(p, 0xff);
	brw_set_compression_control(p, BRW_COMPRESSION_NONE);

	insn = brw_next_insn(p, BRW_OPCODE_SEND);
	insn->header.destreg__conditionalmod = msg_reg_nr;

	brw_set_dest(p, insn, dest);
	brw_set_src0(p, insn, src);
	brw_set_math_message(p, insn, function,
			     BRW_MATH_INTEGER_UNSIGNED,
			     precision,
			     saturate,
			     BRW_MATH_DATA_VECTOR);

	/* Second instruction:
	 */
	insn = brw_next_insn(p, BRW_OPCODE_SEND);
	insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
	insn->header.destreg__conditionalmod = msg_reg_nr+1;

	brw_set_dest(p, insn, __offset(dest,1));
	brw_set_src0(p, insn, src);
	brw_set_math_message(p, insn, function,
			     BRW_MATH_INTEGER_UNSIGNED,
			     precision,
			     saturate,
			     BRW_MATH_DATA_VECTOR);

	brw_pop_insn_state(p);
}

/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
	uint32_t msg_control, msg_type;
	int mlen;

	/* On gen6+ the message offset is in units of owords. */
	if (p->gen >= 60)
		offset /= 16;

	mrf = __retype_ud(mrf);

	if (num_regs == 1) {
		msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
		mlen = 2;
	} else {
		msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
		mlen = 3;
	}

	/* Set up the message header.  This is g0, with g0.2 filled with
	 * the offset.  We don't want to leave our offset around in g0 or
	 * it'll screw up texture samples, so set it up inside the message
	 * reg.
	 */
	{
		brw_push_insn_state(p);
		brw_set_mask_control(p, BRW_MASK_DISABLE);
		brw_set_compression_control(p, BRW_COMPRESSION_NONE);

		brw_MOV(p, mrf, __retype_ud(brw_vec8_grf(0, 0)));

		/* set message header global offset field (reg 0, element 2) */
		brw_MOV(p,
			__retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, mrf.nr, 2)),
			brw_imm_ud(offset));

		brw_pop_insn_state(p);
	}

	{
		struct brw_reg dest;
		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
		int send_commit_msg;
		struct brw_reg src_header = __retype_uw(brw_vec8_grf(0, 0));

		if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
			insn->header.compression_control = BRW_COMPRESSION_NONE;
			src_header = vec16(src_header);
		}
		assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
		insn->header.destreg__conditionalmod = mrf.nr;

		/* Until gen6, writes followed by reads from the same location
		 * are not guaranteed to be ordered unless write_commit is set.
		 * If set, then a no-op write is issued to the destination
		 * register to set a dependency, and a read from the destination
		 * can be used to ensure the ordering.
		 *
		 * For gen6, only writes between different threads need ordering
		 * protection.  Our use of DP writes is all about register
		 * spilling within a thread.
		 */
		if (p->gen >= 60) {
			dest = __retype_uw(vec16(brw_null_reg()));
			send_commit_msg = 0;
		} else {
			dest = src_header;
			send_commit_msg = 1;
		}

		brw_set_dest(p, insn, dest);
		if (p->gen >= 60) {
			brw_set_src0(p, insn, mrf);
		} else {
			brw_set_src0(p, insn, brw_null_reg());
		}

		if (p->gen >= 60)
			msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
		else
			msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

		brw_set_dp_write_message(p,
					 insn,
					 255, /* binding table index (255=stateless) */
					 msg_control,
					 msg_type,
					 mlen,
					 true, /* header_present */
					 0, /* last_render_target */
					 send_commit_msg, /* response_length */
					 0, /* eot */
					 send_commit_msg);
	}
}


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
	uint32_t msg_control;
	int rlen;

	/* On gen6+ the message offset is in units of owords. */
	if (p->gen >= 60)
		offset /= 16;

	mrf = __retype_ud(mrf);
	dest = __retype_uw(dest);

	if (num_regs == 1) {
		msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
		rlen = 1;
	} else {
		msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
		rlen = 2;
	}

	{
		brw_push_insn_state(p);
		brw_set_compression_control(p, BRW_COMPRESSION_NONE);
		brw_set_mask_control(p, BRW_MASK_DISABLE);

		brw_MOV(p, mrf, __retype_ud(brw_vec8_grf(0, 0)));

		/* set message header global offset field (reg 0, element 2) */
		brw_MOV(p,
			__retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, mrf.nr, 2)),
			brw_imm_ud(offset));

		brw_pop_insn_state(p);
	}

	{
		struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);

		assert(insn->header.predicate_control == 0);
		insn->header.compression_control = BRW_COMPRESSION_NONE;
		insn->header.destreg__conditionalmod = mrf.nr;

		brw_set_dest(p, insn, dest);	/* UW? */
		if (p->gen >= 60) {
			brw_set_src0(p, insn, mrf);
		} else {
			brw_set_src0(p, insn, brw_null_reg());
		}

		brw_set_dp_read_message(p,
					insn,
					255, /* binding table index (255=stateless) */
					msg_control,
					BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
					BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
					1, /* msg_length */
					rlen);
	}
}

/**
 * Read a float[4] vector from the data port Data Cache (const buffer).
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_compile *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
	struct brw_instruction *insn;

	/* On newer hardware, offset is in units of owords. */
	if (p->gen >= 60)
		offset /= 16;

	mrf = __retype_ud(mrf);

	brw_push_insn_state(p);
	brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	brw_set_compression_control(p, BRW_COMPRESSION_NONE);
	brw_set_mask_control(p, BRW_MASK_DISABLE);

	brw_MOV(p, mrf, __retype_ud(brw_vec8_grf(0, 0)));

	/* set message header global offset field (reg 0, element 2) */
	brw_MOV(p,
		__retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, mrf.nr, 2)),
		brw_imm_ud(offset));

	insn = brw_next_insn(p, BRW_OPCODE_SEND);
	insn->header.destreg__conditionalmod = mrf.nr;

	/* cast dest to a uword[8] vector */
	dest = __retype_uw(vec8(dest));

	brw_set_dest(p, insn, dest);
	if (p->gen >= 60) {
		brw_set_src0(p, insn, mrf);
	} else {
		brw_set_src0(p, insn, brw_null_reg());
	}

	brw_set_dp_read_message(p,
				insn,
				bind_table_index,
				BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
				BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
				BRW_DATAPORT_READ_TARGET_DATA_CACHE,
				1, /* msg_length */
				1); /* response_length (1 reg, 2 owords!) */

	brw_pop_insn_state(p);
}

/**
 * Read a set of dwords from the data port Data Cache (const buffer).
 *
 * Location (in buffer) appears as UD offsets in the register after
 * the provided mrf header reg.
 */
void brw_dword_scattered_read(struct brw_compile *p,
			      struct brw_reg dest,
			      struct brw_reg mrf,
			      uint32_t bind_table_index)
{
	struct brw_instruction *insn;

	mrf = __retype_ud(mrf);

	brw_push_insn_state(p);
	brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	brw_set_compression_control(p, BRW_COMPRESSION_NONE);
	brw_set_mask_control(p, BRW_MASK_DISABLE);
	brw_MOV(p, mrf, __retype_ud(brw_vec8_grf(0, 0)));
	brw_pop_insn_state(p);

	insn = brw_next_insn(p, BRW_OPCODE_SEND);
	insn->header.destreg__conditionalmod = mrf.nr;

	/* cast dest to a uword[8] vector */
	dest = __retype_uw(vec8(dest));

	brw_set_dest(p, insn, dest);
	brw_set_src0(p, insn, brw_null_reg());

	brw_set_dp_read_message(p,
				insn,
				bind_table_index,
				BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
				BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
				BRW_DATAPORT_READ_TARGET_DATA_CACHE,
				2, /* msg_length */
				1); /* response_length */
}

/**
 * Read float[4] constant(s) from VS constant buffer.
 * For relative addressing, two float[4] constants will be read into 'dest'.
 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
+ */ +void brw_dp_READ_4_vs(struct brw_compile *p, + struct brw_reg dest, + unsigned location, + unsigned bind_table_index) +{ + struct brw_instruction *insn; + unsigned msg_reg_nr = 1; + + if (p->gen >= 60) + location /= 16; + + /* Setup MRF[1] with location/offset into const buffer */ + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_MOV(p, __retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2)), + brw_imm_ud(location)); + brw_pop_insn_state(p); + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = BRW_PREDICATE_NONE; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = msg_reg_nr; + insn->header.mask_control = BRW_MASK_DISABLE; + + brw_set_dest(p, insn, dest); + if (p->gen >= 60) { + brw_set_src0(p, insn, brw_message_reg(msg_reg_nr)); + } else { + brw_set_src0(p, insn, brw_null_reg()); + } + + brw_set_dp_read_message(p, + insn, + bind_table_index, + 0, + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ + BRW_DATAPORT_READ_TARGET_DATA_CACHE, + 1, /* msg_length */ + 1); /* response_length (1 Oword) */ +} + +/** + * Read a float[4] constant per vertex from VS constant buffer, with + * relative addressing. 
+ */ +void brw_dp_READ_4_vs_relative(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg addr_reg, + unsigned offset, + unsigned bind_table_index) +{ + struct brw_reg src = brw_vec8_grf(0, 0); + struct brw_instruction *insn; + int msg_type; + + /* Setup MRF[1] with offset into const buffer */ + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + /* M1.0 is block offset 0, M1.4 is block offset 1, all other + * fields ignored. + */ + brw_ADD(p, __retype_d(brw_message_reg(1)), + addr_reg, brw_imm_d(offset)); + brw_pop_insn_state(p); + + gen6_resolve_implied_move(p, &src, 0); + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + insn->header.predicate_control = BRW_PREDICATE_NONE; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = 0; + insn->header.mask_control = BRW_MASK_DISABLE; + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src); + + if (p->gen >= 60) + msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else if (p->gen >= 45) + msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else + msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + + brw_set_dp_read_message(p, + insn, + bind_table_index, + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, + msg_type, + BRW_DATAPORT_READ_TARGET_DATA_CACHE, + 2, /* msg_length */ + 1); /* response_length */ +} + +void brw_fb_WRITE(struct brw_compile *p, + int dispatch_width, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned msg_control, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + bool eot, + bool header_present) +{ + struct brw_instruction *insn; + unsigned msg_type; + struct brw_reg dest; + + if (dispatch_width == 16) + dest = __retype_uw(vec16(brw_null_reg())); + else + dest = __retype_uw(vec8(brw_null_reg())); + + if (p->gen >= 60 && 
binding_table_index == 0) { + insn = brw_next_insn(p, BRW_OPCODE_SENDC); + } else { + insn = brw_next_insn(p, BRW_OPCODE_SEND); + } + /* The execution mask is ignored for render target writes. */ + insn->header.predicate_control = 0; + insn->header.compression_control = BRW_COMPRESSION_NONE; + + if (p->gen >= 60) { + /* headerless version, just submit color payload */ + src0 = brw_message_reg(msg_reg_nr); + + msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE; + } else { + insn->header.destreg__conditionalmod = msg_reg_nr; + + msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE; + } + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_dp_write_message(p, + insn, + binding_table_index, + msg_control, + msg_type, + msg_length, + header_present, + eot, + response_length, + eot, + 0 /* send_commit_msg */); +} + +/** + * Texture sample instruction. + * Note: the msg_type plus msg_length values determine exactly what kind + * of sampling operation is performed. See volume 4, page 161 of docs. 
+ */ +void brw_SAMPLE(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned sampler, + unsigned writemask, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + bool header_present, + unsigned simd_mode) +{ + assert(writemask); + + if (p->gen < 50 || writemask != WRITEMASK_XYZW) { + struct brw_reg m1 = brw_message_reg(msg_reg_nr); + + writemask = ~writemask & WRITEMASK_XYZW; + + brw_push_insn_state(p); + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + + brw_MOV(p, __retype_ud(m1), __retype_ud(brw_vec8_grf(0,0))); + brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(writemask << 12)); + + brw_pop_insn_state(p); + + src0 = __retype_uw(brw_null_reg()); + } + + { + struct brw_instruction *insn; + + gen6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + insn->header.predicate_control = 0; /* XXX */ + insn->header.compression_control = BRW_COMPRESSION_NONE; + if (p->gen < 60) + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_sampler_message(p, insn, + binding_table_index, + sampler, + msg_type, + response_length, + msg_length, + header_present, + simd_mode); + } +} + +/* All these variables are pretty confusing - we might be better off + * using bitmasks and macros for this, in the old style. Or perhaps + * just having the caller instantiate the fields in dword3 itself. 
+ */ +void brw_urb_WRITE(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + bool used, + unsigned msg_length, + unsigned response_length, + bool eot, + bool writes_complete, + unsigned offset, + unsigned swizzle) +{ + struct brw_instruction *insn; + + gen6_resolve_implied_move(p, &src0, msg_reg_nr); + + if (p->gen >= 70) { + /* Enable Channel Masks in the URB_WRITE_HWORD message header */ + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_OR(p, __retype_ud(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5)), + __retype_ud(brw_vec1_grf(0, 5)), + brw_imm_ud(0xff00)); + brw_pop_insn_state(p); + } + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + + assert(msg_length < BRW_MAX_MRF); + + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, brw_imm_d(0)); + + if (p->gen <= 60) + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_urb_message(p, + insn, + allocate, + used, + msg_length, + response_length, + eot, + writes_complete, + offset, + swizzle); +} + +static int +brw_find_next_block_end(struct brw_compile *p, int start) +{ + int ip; + + for (ip = start + 1; ip < p->nr_insn; ip++) { + struct brw_instruction *insn = &p->store[ip]; + + switch (insn->header.opcode) { + case BRW_OPCODE_ENDIF: + case BRW_OPCODE_ELSE: + case BRW_OPCODE_WHILE: + return ip; + } + } + assert(!"not reached"); + return start + 1; +} + +/* There is no DO instruction on gen6, so to find the end of the loop + * we have to see if the loop is jumping back before our start + * instruction. + */ +static int +brw_find_loop_end(struct brw_compile *p, int start) +{ + int ip; + int br = 2; + + for (ip = start + 1; ip < p->nr_insn; ip++) { + struct brw_instruction *insn = &p->store[ip]; + + if (insn->header.opcode == BRW_OPCODE_WHILE) { + int jip = p->gen <= 70 ? 
insn->bits1.branch_gen6.jump_count + : insn->bits3.break_cont.jip; + if (ip + jip / br <= start) + return ip; + } + } + assert(!"not reached"); + return start + 1; +} + +/* After program generation, go back and update the UIP and JIP of + * BREAK and CONT instructions to their correct locations. + */ +void +brw_set_uip_jip(struct brw_compile *p) +{ + int ip; + int br = 2; + + if (p->gen <= 60) + return; + + for (ip = 0; ip < p->nr_insn; ip++) { + struct brw_instruction *insn = &p->store[ip]; + + switch (insn->header.opcode) { + case BRW_OPCODE_BREAK: + insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip); + /* Gen7 UIP points to WHILE; Gen6 points just after it */ + insn->bits3.break_cont.uip = + br * (brw_find_loop_end(p, ip) - ip + (p->gen <= 70 ? 1 : 0)); + break; + case BRW_OPCODE_CONTINUE: + insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip); + insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip); + + assert(insn->bits3.break_cont.uip != 0); + assert(insn->bits3.break_cont.jip != 0); + break; + } + } +} + +void brw_ff_sync(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + bool allocate, + unsigned response_length, + bool eot) +{ + struct brw_instruction *insn; + + gen6_resolve_implied_move(p, &src0, msg_reg_nr); + + insn = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, insn, dest); + brw_set_src0(p, insn, src0); + brw_set_src1(p, insn, brw_imm_d(0)); + + if (p->gen < 60) + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_ff_sync_message(p, + insn, + allocate, + response_length, + eot); +} diff --git a/cogl/driver/drm/brw/brw_eu_util.c b/cogl/driver/drm/brw/brw_eu_util.c new file mode 100644 index 00000000..5405cf17 --- /dev/null +++ b/cogl/driver/drm/brw/brw_eu_util.c @@ -0,0 +1,126 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell + */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + +void brw_math_invert( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src) +{ + brw_math( p, + dst, + BRW_MATH_FUNCTION_INV, + BRW_MATH_SATURATE_NONE, + 0, + src, + BRW_MATH_PRECISION_FULL, + BRW_MATH_DATA_VECTOR ); +} + + + +void brw_copy4(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + GLuint count) +{ + GLuint i; + + dst = vec4(dst); + src = vec4(src); + + for (i = 0; i < count; i++) + { + GLuint delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16)); + } +} + + +void brw_copy8(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + GLuint count) +{ + GLuint i; + + dst = vec8(dst); + src = vec8(src); + + for (i = 0; i < count; i++) + { + GLuint delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + } +} + + +void brw_copy_indirect_to_indirect(struct brw_compile *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + GLuint count) +{ + GLuint i; + + for (i = 0; i < count; i++) + { + GLuint delta = i*32; + brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta)); + brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16)); + } +} + + +void brw_copy_from_indirect(struct brw_compile *p, + struct brw_reg dst, + struct brw_indirect ptr, + GLuint count) +{ + GLuint i; + + dst = vec4(dst); + + for (i = 0; i < count; i++) + { + GLuint delta = i*32; + brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta)); + brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16)); + } +} + + + + diff --git a/cogl/driver/drm/brw/brw_sf.c b/cogl/driver/drm/brw/brw_sf.c new file mode 100644 index 00000000..6f821719 --- /dev/null +++ b/cogl/driver/drm/brw/brw_sf.c @@ -0,0 +1,54 @@ +#include 
"brw.h" + +bool brw_sf_kernel__nomask(struct brw_compile *p) +{ + struct brw_reg inv, v0, v1, v2, delta; + + v0 = brw_vec4_grf(3, 0); + v1 = brw_vec4_grf(4, 0); + v2 = brw_vec4_grf(5, 0); + delta = brw_vec8_grf(7, 0); + + inv = brw_vec4_grf(6, 0); + brw_math_invert(p, inv, brw_vec4_grf(1, 11)); + + brw_MOV(p, brw_message_reg(3), v0); + + brw_ADD(p, delta, v1, brw_negate(v2)); + brw_MUL(p, brw_message_reg(1), delta, brw_vec1_grf(6,0)); + + brw_ADD(p, delta, v2, brw_negate(v0)); + brw_MUL(p, brw_message_reg(2), delta, brw_vec1_grf(6,2)); + + brw_urb_WRITE(p, brw_null_reg(), 0, brw_vec8_grf(0 ,0), + false, true, 4, 0, true, true, 0, + BRW_URB_SWIZZLE_TRANSPOSE); + + return true; +} + +bool brw_sf_kernel__mask(struct brw_compile *p) +{ + struct brw_reg inv, v0, v1, v2; + + v0 = brw_vec8_grf(3, 0); + v1 = brw_vec8_grf(4, 0); + v2 = brw_vec8_grf(5, 0); + + inv = brw_vec4_grf(6, 0); + brw_math_invert(p, inv, brw_vec4_grf(1, 11)); + + brw_MOV(p, brw_message_reg(3), v0); + + brw_ADD(p, brw_vec8_grf(7, 0), v1, brw_negate(v2)); + brw_MUL(p, brw_message_reg(1), brw_vec8_grf(7, 0), brw_vec1_grf(6,0)); + + brw_ADD(p, brw_vec8_grf(7, 0), v2, brw_negate(v0)); + brw_MUL(p, brw_message_reg(2), brw_vec8_grf(7, 0), brw_vec1_grf(6,2)); + + brw_urb_WRITE(p, brw_null_reg(), 0, brw_vec8_grf(0 ,0), + false, true, 4, 0, true, true, 0, + BRW_URB_SWIZZLE_TRANSPOSE); + + return true; +} diff --git a/cogl/driver/drm/brw/brw_test.c b/cogl/driver/drm/brw/brw_test.c new file mode 100644 index 00000000..4f038584 --- /dev/null +++ b/cogl/driver/drm/brw/brw_test.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to 
whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Chris Wilson + * + */ + +#include "brw_test.h" +#include + +void brw_test_compare(const char *function, int gen, + const struct brw_instruction *new, int num_new, + const struct brw_instruction *old, int num_old) +{ + int n; + + if (num_new != num_old || + memcmp(new, old, num_new * sizeof(struct brw_instruction))) { + printf ("%s: new\n", function); + for (n = 0; n < num_new; n++) + brw_disasm(stdout, &new[n], gen); + + printf ("%s: old\n", function); + for (n = 0; n < num_old; n++) + brw_disasm(stdout, &old[n], gen); + printf ("\n"); + } +} + + +/* Check that we can recreate all the existing programs using the assembler */ +int main(int argc, char **argv) +{ + brw_test_gen4(); + brw_test_gen5(); + brw_test_gen6(); + brw_test_gen7(); + + return 0; +} diff --git a/cogl/driver/drm/brw/brw_test.h b/cogl/driver/drm/brw/brw_test.h new file mode 100644 index 00000000..41f4ca6b --- /dev/null +++ b/cogl/driver/drm/brw/brw_test.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without 
limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Chris Wilson + * + */ + +#ifndef BRW_TEST_H +#define BRW_TEST_H + +#include "brw.h" + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) +#endif + +void brw_test_compare(const char *function, int gen, + const struct brw_instruction *new, int num_new, + const struct brw_instruction *old, int num_old); + +void brw_test_gen4(void); +void brw_test_gen5(void); +void brw_test_gen6(void); +void brw_test_gen7(void); + +#endif /* BRW_TEST_H */ diff --git a/cogl/driver/drm/brw/brw_test_gen4.c b/cogl/driver/drm/brw/brw_test_gen4.c new file mode 100644 index 00000000..742c7c24 --- /dev/null +++ b/cogl/driver/drm/brw/brw_test_gen4.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit 
persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Chris Wilson + * + */ + +#include "brw_test.h" + +#include + +static const uint32_t sf_kernel[][4] = { +#include "exa_sf.g4b" +}; + +static const uint32_t sf_kernel_mask[][4] = { +#include "exa_sf_mask.g4b" +}; + +static const uint32_t ps_kernel_nomask_affine[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_affine.g4b" +#include "exa_wm_src_sample_argb.g4b" +#include "exa_wm_write.g4b" +}; + +static const uint32_t ps_kernel_nomask_projective[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_projective.g4b" +#include "exa_wm_src_sample_argb.g4b" +#include "exa_wm_write.g4b" +}; + +static const uint32_t ps_kernel_maskca_affine[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_affine.g4b" +#include "exa_wm_src_sample_argb.g4b" +#include "exa_wm_mask_affine.g4b" +#include "exa_wm_mask_sample_argb.g4b" +#include "exa_wm_ca.g4b" +#include "exa_wm_write.g4b" +}; + +static const uint32_t ps_kernel_maskca_projective[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_projective.g4b" +#include "exa_wm_src_sample_argb.g4b" +#include "exa_wm_mask_projective.g4b" +#include "exa_wm_mask_sample_argb.g4b" +#include "exa_wm_ca.g4b" +#include "exa_wm_write.g4b" +}; + +static const uint32_t 
ps_kernel_maskca_srcalpha_affine[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_affine.g4b" +#include "exa_wm_src_sample_a.g4b" +#include "exa_wm_mask_affine.g4b" +#include "exa_wm_mask_sample_argb.g4b" +#include "exa_wm_ca_srcalpha.g4b" +#include "exa_wm_write.g4b" +}; + +static const uint32_t ps_kernel_maskca_srcalpha_projective[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_projective.g4b" +#include "exa_wm_src_sample_a.g4b" +#include "exa_wm_mask_projective.g4b" +#include "exa_wm_mask_sample_argb.g4b" +#include "exa_wm_ca_srcalpha.g4b" +#include "exa_wm_write.g4b" +}; + +static const uint32_t ps_kernel_masknoca_affine[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_affine.g4b" +#include "exa_wm_src_sample_argb.g4b" +#include "exa_wm_mask_affine.g4b" +#include "exa_wm_mask_sample_a.g4b" +#include "exa_wm_noca.g4b" +#include "exa_wm_write.g4b" +}; + +static const uint32_t ps_kernel_masknoca_projective[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_projective.g4b" +#include "exa_wm_src_sample_argb.g4b" +#include "exa_wm_mask_projective.g4b" +#include "exa_wm_mask_sample_a.g4b" +#include "exa_wm_noca.g4b" +#include "exa_wm_write.g4b" +}; + +static const uint32_t ps_kernel_packed_static[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_affine.g4b" +#include "exa_wm_src_sample_argb.g4b" +#include "exa_wm_yuv_rgb.g4b" +#include "exa_wm_write.g4b" +}; + +static const uint32_t ps_kernel_planar_static[][4] = { +#include "exa_wm_xy.g4b" +#include "exa_wm_src_affine.g4b" +#include "exa_wm_src_sample_planar.g4b" +#include "exa_wm_yuv_rgb.g4b" +#include "exa_wm_write.g4b" +}; + +#define compare(old) brw_test_compare(__FUNCTION__, p.gen, p.store, p.nr_insn, (struct brw_instruction *)old, ARRAY_SIZE(old)-8) + +static void gen4_sf__nomask(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 40, store); + brw_sf_kernel__nomask(&p); + + compare(sf_kernel); +} + +static void gen4_sf__mask(void) +{ + uint32_t 
store[128]; + struct brw_compile p; + + brw_compile_init(&p, 40, store); + brw_sf_kernel__mask(&p); + + compare(sf_kernel_mask); +} + +static void +gen4_wm_kernel__affine_nomask(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 40, store); + brw_wm_kernel__affine(&p, 16); + + compare(ps_kernel_nomask_affine); +} + +static void +gen4_wm_kernel__affine_mask_noca(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 40, store); + brw_wm_kernel__affine_mask(&p, 16); + + compare(ps_kernel_masknoca_affine); +} + +static void +gen4_wm_kernel__projective_nomask(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 40, store); + brw_wm_kernel__projective(&p, 16); + + compare(ps_kernel_nomask_projective); +} + +void brw_test_gen4(void) +{ + gen4_sf__nomask(); + gen4_sf__mask(); + + gen4_wm_kernel__affine_nomask(); + gen4_wm_kernel__affine_mask_noca(); + + gen4_wm_kernel__projective_nomask(); +} diff --git a/cogl/driver/drm/brw/brw_test_gen5.c b/cogl/driver/drm/brw/brw_test_gen5.c new file mode 100644 index 00000000..62a999e1 --- /dev/null +++ b/cogl/driver/drm/brw/brw_test_gen5.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Chris Wilson + * + */ + +#include "brw_test.h" + +#include + +static const uint32_t sf_kernel[][4] = { +#include "exa_sf.g5b" +}; + +static const uint32_t sf_kernel_mask[][4] = { +#include "exa_sf_mask.g5b" +}; + +static const uint32_t ps_kernel_nomask_affine[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_affine.g5b" +#include "exa_wm_src_sample_argb.g5b" +#include "exa_wm_write.g5b" +}; + +static const uint32_t ps_kernel_nomask_projective[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_projective.g5b" +#include "exa_wm_src_sample_argb.g5b" +#include "exa_wm_write.g5b" +}; + +static const uint32_t ps_kernel_maskca_affine[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_affine.g5b" +#include "exa_wm_src_sample_argb.g5b" +#include "exa_wm_mask_affine.g5b" +#include "exa_wm_mask_sample_argb.g5b" +#include "exa_wm_ca.g5b" +#include "exa_wm_write.g5b" +}; + +static const uint32_t ps_kernel_maskca_projective[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_projective.g5b" +#include "exa_wm_src_sample_argb.g5b" +#include "exa_wm_mask_projective.g5b" +#include "exa_wm_mask_sample_argb.g5b" +#include "exa_wm_ca.g5b" +#include "exa_wm_write.g5b" +}; + +static const uint32_t ps_kernel_maskca_srcalpha_affine[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_affine.g5b" +#include "exa_wm_src_sample_a.g5b" +#include "exa_wm_mask_affine.g5b" +#include "exa_wm_mask_sample_argb.g5b" +#include "exa_wm_ca_srcalpha.g5b" +#include 
"exa_wm_write.g5b" +}; + +static const uint32_t ps_kernel_maskca_srcalpha_projective[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_projective.g5b" +#include "exa_wm_src_sample_a.g5b" +#include "exa_wm_mask_projective.g5b" +#include "exa_wm_mask_sample_argb.g5b" +#include "exa_wm_ca_srcalpha.g5b" +#include "exa_wm_write.g5b" +}; + +static const uint32_t ps_kernel_masknoca_affine[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_affine.g5b" +#include "exa_wm_src_sample_argb.g5b" +#include "exa_wm_mask_affine.g5b" +#include "exa_wm_mask_sample_a.g5b" +#include "exa_wm_noca.g5b" +#include "exa_wm_write.g5b" +}; + +static const uint32_t ps_kernel_masknoca_projective[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_projective.g5b" +#include "exa_wm_src_sample_argb.g5b" +#include "exa_wm_mask_projective.g5b" +#include "exa_wm_mask_sample_a.g5b" +#include "exa_wm_noca.g5b" +#include "exa_wm_write.g5b" +}; + +static const uint32_t ps_kernel_packed_static[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_affine.g5b" +#include "exa_wm_src_sample_argb.g5b" +#include "exa_wm_yuv_rgb.g5b" +#include "exa_wm_write.g5b" +}; + +static const uint32_t ps_kernel_planar_static[][4] = { +#include "exa_wm_xy.g5b" +#include "exa_wm_src_affine.g5b" +#include "exa_wm_src_sample_planar.g5b" +#include "exa_wm_yuv_rgb.g5b" +#include "exa_wm_write.g5b" +}; + +#define compare(old) brw_test_compare(__FUNCTION__, p.gen, p.store, p.nr_insn, (struct brw_instruction *)old, ARRAY_SIZE(old)) + +static void gen5_sf(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 50, store); + brw_sf_kernel__nomask(&p); + + compare(sf_kernel); +} + +static void gen5_sf_mask(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 50, store); + brw_sf_kernel__mask(&p); + + compare(sf_kernel_mask); +} + +static void gen5_wm_affine_nomask(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 50, store); + 
brw_wm_kernel__affine(&p, 16); + + compare(ps_kernel_nomask_affine); +} + +static void gen5_wm_affine_mask_noca(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 50, store); + brw_wm_kernel__affine_mask(&p, 16); + + compare(ps_kernel_masknoca_affine); +} + +static void gen5_wm_affine_mask_ca(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 50, store); + brw_wm_kernel__affine_mask_ca(&p, 16); + + compare(ps_kernel_maskca_affine); +} + +static void gen5_wm_projective_nomask(void) +{ + uint32_t store[128]; + struct brw_compile p; + + brw_compile_init(&p, 50, store); + brw_wm_kernel__projective(&p, 16); + + compare(ps_kernel_nomask_projective); +} + +void brw_test_gen5(void) +{ + gen5_sf(); + gen5_sf_mask(); + + gen5_wm_affine_nomask(); + gen5_wm_affine_mask_noca(); + gen5_wm_affine_mask_ca(); + + gen5_wm_projective_nomask(); +} diff --git a/cogl/driver/drm/brw/brw_test_gen6.c b/cogl/driver/drm/brw/brw_test_gen6.c new file mode 100644 index 00000000..64bc2fb1 --- /dev/null +++ b/cogl/driver/drm/brw/brw_test_gen6.c @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Chris Wilson + * + */ + +#include "brw_test.h" + +#include + +static const uint32_t ps_kernel_nomask_affine[][4] = { +#include "exa_wm_src_affine.g6b" +#include "exa_wm_src_sample_argb.g6b" +#include "exa_wm_write.g6b" +}; + +static const uint32_t ps_kernel_nomask_projective[][4] = { +#include "exa_wm_src_projective.g6b" +#include "exa_wm_src_sample_argb.g6b" +#include "exa_wm_write.g6b" +}; + +static const uint32_t ps_kernel_maskca_affine[][4] = { +#include "exa_wm_src_affine.g6b" +#include "exa_wm_src_sample_argb.g6b" +#include "exa_wm_mask_affine.g6b" +#include "exa_wm_mask_sample_argb.g6b" +#include "exa_wm_ca.g6b" +#include "exa_wm_write.g6b" +}; + +static const uint32_t ps_kernel_maskca_projective[][4] = { +#include "exa_wm_src_projective.g6b" +#include "exa_wm_src_sample_argb.g6b" +#include "exa_wm_mask_projective.g6b" +#include "exa_wm_mask_sample_argb.g6b" +#include "exa_wm_ca.g6b" +#include "exa_wm_write.g6b" +}; + +static const uint32_t ps_kernel_maskca_srcalpha_affine[][4] = { +#include "exa_wm_src_affine.g6b" +#include "exa_wm_src_sample_a.g6b" +#include "exa_wm_mask_affine.g6b" +#include "exa_wm_mask_sample_argb.g6b" +#include "exa_wm_ca_srcalpha.g6b" +#include "exa_wm_write.g6b" +}; + +static const uint32_t ps_kernel_maskca_srcalpha_projective[][4] = { +#include "exa_wm_src_projective.g6b" +#include "exa_wm_src_sample_a.g6b" +#include "exa_wm_mask_projective.g6b" +#include "exa_wm_mask_sample_argb.g6b" +#include "exa_wm_ca_srcalpha.g6b" +#include 
"exa_wm_write.g6b" +}; + +static const uint32_t ps_kernel_masknoca_affine[][4] = { +#include "exa_wm_src_affine.g6b" +#include "exa_wm_src_sample_argb.g6b" +#include "exa_wm_mask_affine.g6b" +#include "exa_wm_mask_sample_a.g6b" +#include "exa_wm_noca.g6b" +#include "exa_wm_write.g6b" +}; + +static const uint32_t ps_kernel_masknoca_projective[][4] = { +#include "exa_wm_src_projective.g6b" +#include "exa_wm_src_sample_argb.g6b" +#include "exa_wm_mask_projective.g6b" +#include "exa_wm_mask_sample_a.g6b" +#include "exa_wm_noca.g6b" +#include "exa_wm_write.g6b" +}; + +static const uint32_t ps_kernel_packed[][4] = { +#include "exa_wm_src_affine.g6b" +#include "exa_wm_src_sample_argb.g6b" +#include "exa_wm_yuv_rgb.g6b" +#include "exa_wm_write.g6b" +}; + +static const uint32_t ps_kernel_planar[][4] = { +#include "exa_wm_src_affine.g6b" +#include "exa_wm_src_sample_planar.g6b" +#include "exa_wm_yuv_rgb.g6b" +#include "exa_wm_write.g6b" +}; + +#define compare(old) brw_test_compare(__FUNCTION__, p.gen, p.store, p.nr_insn, (struct brw_instruction *)old, ARRAY_SIZE(old)) + +#if 0 +static void wm_src_affine(struct brw_compile *p) +{ + brw_PLN(p, brw_message_reg(2), brw_vec1_grf(6,0), brw_vec8_grf(2,0)); + brw_PLN(p, brw_message_reg(3), brw_vec1_grf(6,0), brw_vec8_grf(4,0)); + brw_PLN(p, brw_message_reg(4), brw_vec1_grf(6,4), brw_vec8_grf(2,0)); + brw_PLN(p, brw_message_reg(5), brw_vec1_grf(6,4), brw_vec8_grf(4,0)); +} + +static void wm_src_sample_argb(struct brw_compile *p) +{ + static const uint32_t fragment[][4] = { +#include "exa_wm_src_affine.g6b" +#include "exa_wm_src_sample_argb.g6b" +#include "exa_wm_write.g6b" + }; + int n; + + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, + retype(brw_vec1_grf(0,2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0)); + brw_pop_insn_state(p); + + brw_SAMPLE(p, + retype(vec16(brw_vec8_grf(14, 0)), BRW_REGISTER_TYPE_UW), + 1, + retype(brw_vec8_grf(0, 0), 
BRW_REGISTER_TYPE_UD), + 1, 0, + WRITEMASK_XYZW, + GEN5_SAMPLER_MESSAGE_SAMPLE, + 8, + 5, + true, + BRW_SAMPLER_SIMD_MODE_SIMD16); + + + for (n = 0; n < p->nr_insn; n++) { + brw_disasm(stdout, &p->store[n], 60); + } + + printf("\n\n"); + for (n = 0; n < ARRAY_SIZE(fragment); n++) { + brw_disasm(stdout, + (const struct brw_instruction *)&fragment[n][0], + 60); + } +} + +static void wm_write(struct brw_compile *p) +{ +} +#endif + +static void gen6_ps_nomask_affine(void) +{ + uint32_t store[1024]; + struct brw_compile p; + + brw_compile_init(&p, 60, store); + brw_wm_kernel__affine(&p, 16); + + compare(ps_kernel_nomask_affine); +} + +static void gen6_ps_mask_affine(void) +{ + uint32_t store[1024]; + struct brw_compile p; + + brw_compile_init(&p, 60, store); + brw_wm_kernel__affine_mask(&p, 16); + + compare(ps_kernel_masknoca_affine); +} + +static void gen6_ps_nomask_projective(void) +{ + uint32_t store[1024]; + struct brw_compile p; + + brw_compile_init(&p, 60, store); + brw_wm_kernel__projective(&p, 16); + + compare(ps_kernel_nomask_projective); +} + +void brw_test_gen6(void) +{ + gen6_ps_nomask_affine(); + gen6_ps_mask_affine(); + + gen6_ps_nomask_projective(); +} diff --git a/cogl/driver/drm/brw/brw_test_gen7.c b/cogl/driver/drm/brw/brw_test_gen7.c new file mode 100644 index 00000000..085b25cc --- /dev/null +++ b/cogl/driver/drm/brw/brw_test_gen7.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be 
included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Chris Wilson + * + */ + +#include "brw_test.h" + +#include + +static const uint32_t ps_kernel_nomask_affine[][4] = { +#include "exa_wm_src_affine.g7b" +#include "exa_wm_src_sample_argb.g7b" +#include "exa_wm_write.g7b" +}; + +static const uint32_t ps_kernel_nomask_projective[][4] = { +#include "exa_wm_src_projective.g7b" +#include "exa_wm_src_sample_argb.g7b" +#include "exa_wm_write.g7b" +}; + +static const uint32_t ps_kernel_maskca_affine[][4] = { +#include "exa_wm_src_affine.g7b" +#include "exa_wm_src_sample_argb.g7b" +#include "exa_wm_mask_affine.g7b" +#include "exa_wm_mask_sample_argb.g7b" +#include "exa_wm_ca.g6b" +#include "exa_wm_write.g7b" +}; + +static const uint32_t ps_kernel_maskca_projective[][4] = { +#include "exa_wm_src_projective.g7b" +#include "exa_wm_src_sample_argb.g7b" +#include "exa_wm_mask_projective.g7b" +#include "exa_wm_mask_sample_argb.g7b" +#include "exa_wm_ca.g6b" +#include "exa_wm_write.g7b" +}; + +static const uint32_t ps_kernel_maskca_srcalpha_affine[][4] = { +#include "exa_wm_src_affine.g7b" +#include "exa_wm_src_sample_a.g7b" +#include "exa_wm_mask_affine.g7b" +#include "exa_wm_mask_sample_argb.g7b" +#include "exa_wm_ca_srcalpha.g6b" +#include "exa_wm_write.g7b" +}; + +static const uint32_t ps_kernel_maskca_srcalpha_projective[][4] = { +#include "exa_wm_src_projective.g7b" +#include "exa_wm_src_sample_a.g7b" +#include "exa_wm_mask_projective.g7b" +#include 
"exa_wm_mask_sample_argb.g7b" +#include "exa_wm_ca_srcalpha.g6b" +#include "exa_wm_write.g7b" +}; + +static const uint32_t ps_kernel_masknoca_affine[][4] = { +#include "exa_wm_src_affine.g7b" +#include "exa_wm_src_sample_argb.g7b" +#include "exa_wm_mask_affine.g7b" +#include "exa_wm_mask_sample_a.g7b" +#include "exa_wm_noca.g6b" +#include "exa_wm_write.g7b" +}; + +static const uint32_t ps_kernel_masknoca_projective[][4] = { +#include "exa_wm_src_projective.g7b" +#include "exa_wm_src_sample_argb.g7b" +#include "exa_wm_mask_projective.g7b" +#include "exa_wm_mask_sample_a.g7b" +#include "exa_wm_noca.g6b" +#include "exa_wm_write.g7b" +}; + +static const uint32_t ps_kernel_packed[][4] = { +#include "exa_wm_src_affine.g7b" +#include "exa_wm_src_sample_argb.g7b" +#include "exa_wm_yuv_rgb.g7b" +#include "exa_wm_write.g7b" +}; + +static const uint32_t ps_kernel_planar[][4] = { +#include "exa_wm_src_affine.g7b" +#include "exa_wm_src_sample_planar.g7b" +#include "exa_wm_yuv_rgb.g7b" +#include "exa_wm_write.g7b" +}; + +#define compare(old) brw_test_compare(__FUNCTION__, p.gen, p.store, p.nr_insn, (struct brw_instruction *)old, ARRAY_SIZE(old)) +#define GEN 70 + +static void gen7_ps_nomask_affine(void) +{ + uint32_t store[1024]; + struct brw_compile p; + + brw_compile_init(&p, GEN, store); + brw_wm_kernel__affine(&p, 8); + + compare(ps_kernel_nomask_affine); +} + +static void gen7_ps_mask_affine(void) +{ + uint32_t store[1024]; + struct brw_compile p; + + brw_compile_init(&p, GEN, store); + brw_wm_kernel__affine_mask(&p, 8); + + compare(ps_kernel_masknoca_affine); +} + +static void gen7_ps_maskca_affine(void) +{ + uint32_t store[1024]; + struct brw_compile p; + + brw_compile_init(&p, GEN, store); + brw_wm_kernel__affine_mask_ca(&p, 8); + + compare(ps_kernel_maskca_affine); +} + +static void gen7_ps_masksa_affine(void) +{ + uint32_t store[1024]; + struct brw_compile p; + + brw_compile_init(&p, GEN, store); + brw_wm_kernel__affine_mask_sa(&p, 8); + + 
compare(ps_kernel_maskca_srcalpha_affine); +} + +static void gen7_ps_nomask_projective(void) +{ + uint32_t store[1024]; + struct brw_compile p; + + brw_compile_init(&p, GEN, store); + brw_wm_kernel__projective(&p, 8); + + compare(ps_kernel_nomask_projective); +} + +static void gen7_ps_opacity(void) +{ + uint32_t store[1024]; + struct brw_compile p; + + brw_compile_init(&p, GEN, store); + brw_wm_kernel__affine_opacity(&p, 16); + + compare(ps_kernel_nomask_affine); +} + +void brw_test_gen7(void) +{ + gen7_ps_nomask_affine(); + gen7_ps_mask_affine(); + gen7_ps_maskca_affine(); + gen7_ps_masksa_affine(); + + gen7_ps_nomask_projective(); + + gen7_ps_opacity(); +} diff --git a/cogl/driver/drm/brw/brw_wm.c b/cogl/driver/drm/brw/brw_wm.c new file mode 100644 index 00000000..f54e55ef --- /dev/null +++ b/cogl/driver/drm/brw/brw_wm.c @@ -0,0 +1,681 @@ +#include "brw.h" + +#define X16 8 +#define Y16 10 + +static void brw_wm_xy(struct brw_compile *p, int dw) +{ + struct brw_reg r1 = brw_vec1_grf(1, 0); + struct brw_reg r1_uw = __retype_uw(r1); + struct brw_reg x_uw, y_uw; + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + if (dw == 16) { + x_uw = brw_uw16_grf(30, 0); + y_uw = brw_uw16_grf(28, 0); + } else { + x_uw = brw_uw8_grf(30, 0); + y_uw = brw_uw8_grf(28, 0); + } + + brw_ADD(p, + x_uw, + __stride(__suboffset(r1_uw, 4), 2, 4, 0), + brw_imm_v(0x10101010)); + brw_ADD(p, + y_uw, + __stride(__suboffset(r1_uw, 5), 2, 4, 0), + brw_imm_v(0x11001100)); + + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + brw_ADD(p, brw_vec8_grf(X16, 0), vec8(x_uw), brw_negate(r1)); + brw_ADD(p, brw_vec8_grf(Y16, 0), vec8(y_uw), brw_negate(__suboffset(r1, 1))); +} + +static void brw_wm_affine_st(struct brw_compile *p, int dw, + int channel, int msg) +{ + int uv; + + if (dw == 16) { + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + uv = p->gen >= 60 ? 6 : 3; + } else { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + uv = p->gen >= 60 ? 
4 : 3; + } + uv += 2*channel; + + msg++; + if (p->gen >= 60) { + brw_PLN(p, + brw_message_reg(msg), + brw_vec1_grf(uv, 0), + brw_vec8_grf(2, 0)); + msg += dw/8; + + brw_PLN(p, + brw_message_reg(msg), + brw_vec1_grf(uv, 4), + brw_vec8_grf(2, 0)); + } else { + struct brw_reg r = brw_vec1_grf(uv, 0); + + brw_LINE(p, brw_null_reg(), __suboffset(r, 0), brw_vec8_grf(X16, 0)); + brw_MAC(p, brw_message_reg(msg), __suboffset(r, 1), brw_vec8_grf(Y16, 0)); + msg += dw/8; + + brw_LINE(p, brw_null_reg(), __suboffset(r, 4), brw_vec8_grf(X16, 0)); + brw_MAC(p, brw_message_reg(msg), __suboffset(r, 5), brw_vec8_grf(Y16, 0)); + } +} + +static inline unsigned simd(int dw) +{ + return dw == 16 ? BRW_SAMPLER_SIMD_MODE_SIMD16 : BRW_SAMPLER_SIMD_MODE_SIMD8; +} + +static inline struct brw_reg sample_result(int dw, int result) +{ + return brw_reg(BRW_GENERAL_REGISTER_FILE, result, 0, + BRW_REGISTER_TYPE_UW, + dw == 16 ? BRW_VERTICAL_STRIDE_16 : BRW_VERTICAL_STRIDE_8, + dw == 16 ? BRW_WIDTH_16 : BRW_WIDTH_8, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +static int brw_wm_sample(struct brw_compile *p, int dw, + int channel, int msg, int result) +{ + struct brw_reg src0; + bool header; + int len; + + len = dw == 16 ? 
4 : 2; + if (p->gen >= 60) { + header = false; + src0 = brw_message_reg(++msg); + } else { + header = true; + src0 = brw_vec8_grf(0, 0); + } + + brw_SAMPLE(p, sample_result(dw, result), msg, src0, + channel+1, channel, WRITEMASK_XYZW, 0, + 2*len, len+header, header, simd(dw)); + return result; +} + +static int brw_wm_sample__alpha(struct brw_compile *p, int dw, + int channel, int msg, int result) +{ + struct brw_reg src0; + int mlen, rlen; + + if (dw == 8) { + /* SIMD8 sample return is not masked */ + mlen = 3; + rlen = 4; + } else { + mlen = 5; + rlen = 2; + } + + if (p->gen >= 60) + src0 = brw_message_reg(msg); + else + src0 = brw_vec8_grf(0, 0); + + brw_SAMPLE(p, sample_result(dw, result), msg, src0, + channel+1, channel, WRITEMASK_W, 0, + rlen, mlen, true, simd(dw)); + + if (dw == 8) + result += 3; + + return result; +} + +static int brw_wm_affine(struct brw_compile *p, int dw, + int channel, int msg, int result) +{ + brw_wm_affine_st(p, dw, channel, msg); + return brw_wm_sample(p, dw, channel, msg, result); +} + +static int brw_wm_affine__alpha(struct brw_compile *p, int dw, + int channel, int msg, int result) +{ + brw_wm_affine_st(p, dw, channel, msg); + return brw_wm_sample__alpha(p, dw, channel, msg, result); +} + +static inline struct brw_reg null_result(int dw) +{ + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_NULL, 0, + BRW_REGISTER_TYPE_UW, + dw == 16 ? BRW_VERTICAL_STRIDE_16 : BRW_VERTICAL_STRIDE_8, + dw == 16 ? 
BRW_WIDTH_16 : BRW_WIDTH_8, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +static void brw_fb_write(struct brw_compile *p, int dw) +{ + struct brw_instruction *insn; + unsigned msg_control, msg_type, msg_len; + struct brw_reg src0; + bool header; + + if (dw == 16) { + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + msg_len = 8; + } else { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; + msg_len = 4; + } + + if (p->gen < 60) { + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, brw_message_reg(1), brw_vec8_grf(1, 0)); + brw_pop_insn_state(p); + + msg_len += 2; + } + + /* The execution mask is ignored for render target writes. */ + insn = brw_next_insn(p, BRW_OPCODE_SEND); + insn->header.predicate_control = 0; + insn->header.compression_control = BRW_COMPRESSION_NONE; + + if (p->gen >= 60) { + msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE; + src0 = brw_message_reg(2); + header = false; + } else { + insn->header.destreg__conditionalmod = 0; + msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE; + src0 = __retype_uw(brw_vec8_grf(0, 0)); + header = true; + } + + brw_set_dest(p, insn, null_result(dw)); + brw_set_src0(p, insn, src0); + brw_set_dp_write_message(p, insn, 0, + msg_control, msg_type, msg_len, + header, true, 0, true, false); +} + +static void brw_wm_write(struct brw_compile *p, int dw, int src) +{ + int n; + + if (dw == 8 && p->gen >= 60) { + /* XXX pixel execution mask? 
*/ + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + brw_MOV(p, brw_message_reg(2), brw_vec8_grf(src+0, 0)); + brw_MOV(p, brw_message_reg(3), brw_vec8_grf(src+1, 0)); + brw_MOV(p, brw_message_reg(4), brw_vec8_grf(src+2, 0)); + brw_MOV(p, brw_message_reg(5), brw_vec8_grf(src+3, 0)); + goto done; + } + + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + for (n = 0; n < 4; n++) { + if (p->gen >= 60) { + brw_MOV(p, + brw_message_reg(2 + 2*n), + brw_vec8_grf(src + 2*n, 0)); + } else if (p->gen >= 45 && dw == 16) { + brw_MOV(p, + brw_message_reg(2 + n + BRW_MRF_COMPR4), + brw_vec8_grf(src + 2*n, 0)); + } else { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, + brw_message_reg(2 + n), + brw_vec8_grf(src + 2*n, 0)); + + if (dw == 16) { + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_MOV(p, + brw_message_reg(2 + n + 4), + brw_vec8_grf(src + 2*n+1, 0)); + } + } + } + +done: + brw_fb_write(p, dw); +} + +static void brw_wm_write__mask(struct brw_compile *p, int dw, + int src, int mask) +{ + int n; + + if (dw == 8 && p->gen >= 60) { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + brw_MUL(p, + brw_message_reg(2), + brw_vec8_grf(src+0, 0), + brw_vec8_grf(mask, 0)); + brw_MUL(p, + brw_message_reg(3), + brw_vec8_grf(src+1, 0), + brw_vec8_grf(mask, 0)); + brw_MUL(p, + brw_message_reg(4), + brw_vec8_grf(src+2, 0), + brw_vec8_grf(mask, 0)); + brw_MUL(p, + brw_message_reg(5), + brw_vec8_grf(src+3, 0), + brw_vec8_grf(mask, 0)); + + goto done; + } + + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + for (n = 0; n < 4; n++) { + if (p->gen >= 60) { + brw_MUL(p, + brw_message_reg(2 + 2*n), + brw_vec8_grf(src + 2*n, 0), + brw_vec8_grf(mask, 0)); + } else if (p->gen >= 45 && dw == 16) { + brw_MUL(p, + brw_message_reg(2 + n + BRW_MRF_COMPR4), + brw_vec8_grf(src + 2*n, 0), + brw_vec8_grf(mask, 0)); + } else { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MUL(p, + brw_message_reg(2 + n), + 
brw_vec8_grf(src + 2*n, 0), + brw_vec8_grf(mask, 0)); + + if (dw == 16) { + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_MUL(p, + brw_message_reg(2 + n + 4), + brw_vec8_grf(src + 2*n+1, 0), + brw_vec8_grf(mask+1, 0)); + } + } + } + +done: + brw_fb_write(p, dw); +} + +static void brw_wm_write__opacity(struct brw_compile *p, int dw, + int src, int mask) +{ + int n; + + if (dw == 8 && p->gen >= 60) { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + brw_MUL(p, + brw_message_reg(2), + brw_vec8_grf(src+0, 0), + brw_vec1_grf(mask, 3)); + brw_MUL(p, + brw_message_reg(3), + brw_vec8_grf(src+1, 0), + brw_vec1_grf(mask, 3)); + brw_MUL(p, + brw_message_reg(4), + brw_vec8_grf(src+2, 0), + brw_vec1_grf(mask, 3)); + brw_MUL(p, + brw_message_reg(5), + brw_vec8_grf(src+3, 0), + brw_vec1_grf(mask, 3)); + + goto done; + } + + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + for (n = 0; n < 4; n++) { + if (p->gen >= 60) { + brw_MUL(p, + brw_message_reg(2 + 2*n), + brw_vec8_grf(src + 2*n, 0), + brw_vec1_grf(mask, 3)); + } else if (p->gen >= 45 && dw == 16) { + brw_MUL(p, + brw_message_reg(2 + n + BRW_MRF_COMPR4), + brw_vec8_grf(src + 2*n, 0), + brw_vec1_grf(mask, 3)); + } else { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MUL(p, + brw_message_reg(2 + n), + brw_vec8_grf(src + 2*n, 0), + brw_vec1_grf(mask, 3)); + + if (dw == 16) { + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_MUL(p, + brw_message_reg(2 + n + 4), + brw_vec8_grf(src + 2*n+1, 0), + brw_vec1_grf(mask, 3)); + } + } + } + +done: + brw_fb_write(p, dw); +} + +static void brw_wm_write__mask_ca(struct brw_compile *p, int dw, + int src, int mask) +{ + int n; + + if (dw == 8 && p->gen >= 60) { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + brw_MUL(p, + brw_message_reg(2), + brw_vec8_grf(src + 0, 0), + brw_vec8_grf(mask + 0, 0)); + brw_MUL(p, + brw_message_reg(3), + brw_vec8_grf(src + 1, 0), + brw_vec8_grf(mask + 1, 0)); + brw_MUL(p, + 
brw_message_reg(4), + brw_vec8_grf(src + 2, 0), + brw_vec8_grf(mask + 2, 0)); + brw_MUL(p, + brw_message_reg(5), + brw_vec8_grf(src + 3, 0), + brw_vec8_grf(mask + 3, 0)); + + goto done; + } + + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + + for (n = 0; n < 4; n++) { + if (p->gen >= 60) { + brw_MUL(p, + brw_message_reg(2 + 2*n), + brw_vec8_grf(src + 2*n, 0), + brw_vec8_grf(mask + 2*n, 0)); + } else if (p->gen >= 45 && dw == 16) { + brw_MUL(p, + brw_message_reg(2 + n + BRW_MRF_COMPR4), + brw_vec8_grf(src + 2*n, 0), + brw_vec8_grf(mask + 2*n, 0)); + } else { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MUL(p, + brw_message_reg(2 + n), + brw_vec8_grf(src + 2*n, 0), + brw_vec8_grf(mask + 2*n, 0)); + + if (dw == 16) { + brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF); + brw_MUL(p, + brw_message_reg(2 + n + 4), + brw_vec8_grf(src + 2*n + 1, 0), + brw_vec8_grf(mask + 2*n + 1, 0)); + } + } + } + +done: + brw_fb_write(p, dw); +} + +bool +brw_wm_kernel__affine(struct brw_compile *p, int dispatch) +{ + if (p->gen < 60) + brw_wm_xy(p, dispatch); + brw_wm_write(p, dispatch, brw_wm_affine(p, dispatch, 0, 1, 12)); + + return true; +} + +bool +brw_wm_kernel__affine_mask(struct brw_compile *p, int dispatch) +{ + int src, mask; + + if (p->gen < 60) + brw_wm_xy(p, dispatch); + + src = brw_wm_affine(p, dispatch, 0, 1, 12); + mask = brw_wm_affine__alpha(p, dispatch, 1, 6, 20); + brw_wm_write__mask(p, dispatch, src, mask); + + return true; +} + +bool +brw_wm_kernel__affine_mask_ca(struct brw_compile *p, int dispatch) +{ + int src, mask; + + if (p->gen < 60) + brw_wm_xy(p, dispatch); + + src = brw_wm_affine(p, dispatch, 0, 1, 12); + mask = brw_wm_affine(p, dispatch, 1, 6, 20); + brw_wm_write__mask_ca(p, dispatch, src, mask); + + return true; +} + +bool +brw_wm_kernel__affine_mask_sa(struct brw_compile *p, int dispatch) +{ + int src, mask; + + if (p->gen < 60) + brw_wm_xy(p, dispatch); + + src = brw_wm_affine__alpha(p, dispatch, 0, 1, 12); + mask = 
brw_wm_affine(p, dispatch, 1, 6, 16); + brw_wm_write__mask(p, dispatch, mask, src); + + return true; +} + +/* Projective variants */ + +static void brw_wm_projective_st(struct brw_compile *p, int dw, + int channel, int msg) +{ + int uv; + + if (dw == 16) { + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + uv = p->gen >= 60 ? 6 : 3; + } else { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + uv = p->gen >= 60 ? 4 : 3; + } + uv += 2*channel; + + msg++; + if (p->gen >= 60) { + /* First compute 1/z */ + brw_PLN(p, + brw_message_reg(msg), + brw_vec1_grf(uv+1, 0), + brw_vec8_grf(2, 0)); + + if (dw == 16) { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_math_invert(p, brw_vec8_grf(30, 0), brw_vec8_grf(30, 0)); + brw_math_invert(p, brw_vec8_grf(31, 0), brw_vec8_grf(31, 0)); + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + } else + brw_math_invert(p, brw_vec8_grf(30, 0), brw_vec8_grf(30, 0)); + brw_PLN(p, + brw_vec8_grf(28, 0), + brw_vec1_grf(uv, 0), + brw_vec8_grf(2, 0)); + brw_MUL(p, + brw_message_reg(msg), + brw_vec8_grf(28, 0), + brw_vec8_grf(30, 0)); + msg += dw/8; + + brw_PLN(p, + brw_vec8_grf(28, 0), + brw_vec1_grf(uv, 0), + brw_vec8_grf(4, 0)); + brw_MUL(p, + brw_message_reg(msg), + brw_vec8_grf(28, 0), + brw_vec8_grf(30, 0)); + } else { + struct brw_reg r = brw_vec1_grf(uv, 0); + + /* First compute 1/z */ + brw_LINE(p, brw_null_reg(), brw_vec1_grf(uv+1, 0), brw_vec8_grf(X16, 0)); + brw_MAC(p, brw_vec8_grf(30, 0), brw_vec1_grf(uv+1, 1), brw_vec8_grf(Y16, 0)); + + if (dw == 16) { + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_math_invert(p, brw_vec8_grf(30, 0), brw_vec8_grf(30, 0)); + brw_math_invert(p, brw_vec8_grf(31, 0), brw_vec8_grf(31, 0)); + brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); + } else + brw_math_invert(p, brw_vec8_grf(30, 0), brw_vec8_grf(30, 0)); + + /* Now compute the output s,t values */ + brw_LINE(p, brw_null_reg(), __suboffset(r, 0), brw_vec8_grf(X16, 0)); + 
brw_MAC(p, brw_vec8_grf(28, 0), __suboffset(r, 1), brw_vec8_grf(Y16, 0)); + brw_MUL(p, brw_message_reg(msg), brw_vec8_grf(28, 0), brw_vec8_grf(30, 0)); + msg += dw/8; + + brw_LINE(p, brw_null_reg(), __suboffset(r, 4), brw_vec8_grf(X16, 0)); + brw_MAC(p, brw_vec8_grf(28, 0), __suboffset(r, 5), brw_vec8_grf(Y16, 0)); + brw_MUL(p, brw_message_reg(msg), brw_vec8_grf(28, 0), brw_vec8_grf(30, 0)); + } +} + +static int brw_wm_projective(struct brw_compile *p, int dw, + int channel, int msg, int result) +{ + brw_wm_projective_st(p, dw, channel, msg); + return brw_wm_sample(p, dw, channel, msg, result); +} + +static int brw_wm_projective__alpha(struct brw_compile *p, int dw, + int channel, int msg, int result) +{ + brw_wm_projective_st(p, dw, channel, msg); + return brw_wm_sample__alpha(p, dw, channel, msg, result); +} + +bool +brw_wm_kernel__projective(struct brw_compile *p, int dispatch) +{ + if (p->gen < 60) + brw_wm_xy(p, dispatch); + brw_wm_write(p, dispatch, brw_wm_projective(p, dispatch, 0, 1, 12)); + + return true; +} + +bool +brw_wm_kernel__projective_mask(struct brw_compile *p, int dispatch) +{ + int src, mask; + + if (p->gen < 60) + brw_wm_xy(p, dispatch); + + src = brw_wm_projective(p, dispatch, 0, 1, 12); + mask = brw_wm_projective__alpha(p, dispatch, 1, 6, 20); + brw_wm_write__mask(p, dispatch, src, mask); + + return true; +} + +bool +brw_wm_kernel__projective_mask_ca(struct brw_compile *p, int dispatch) +{ + int src, mask; + + if (p->gen < 60) + brw_wm_xy(p, dispatch); + + src = brw_wm_projective(p, dispatch, 0, 1, 12); + mask = brw_wm_projective(p, dispatch, 1, 6, 20); + brw_wm_write__mask_ca(p, dispatch, src, mask); + + return true; +} + +bool +brw_wm_kernel__projective_mask_sa(struct brw_compile *p, int dispatch) +{ + int src, mask; + + if (p->gen < 60) + brw_wm_xy(p, dispatch); + + src = brw_wm_projective__alpha(p, dispatch, 0, 1, 12); + mask = brw_wm_projective(p, dispatch, 1, 6, 16); + brw_wm_write__mask(p, dispatch, mask, src); + + return true; +} + 
+bool +brw_wm_kernel__affine_opacity(struct brw_compile *p, int dispatch) +{ + int src, mask; + + if (p->gen < 60) { + brw_wm_xy(p, dispatch); + mask = 4; + } else + mask = dispatch == 16 ? 8 : 6; + + src = brw_wm_affine(p, dispatch, 0, 1, 12); + brw_wm_write__opacity(p, dispatch, src, mask); + + return true; +} + +bool +brw_wm_kernel__projective_opacity(struct brw_compile *p, int dispatch) +{ + int src, mask; + + if (p->gen < 60) { + brw_wm_xy(p, dispatch); + mask = 4; + } else + mask = dispatch == 16 ? 8 : 6; + + src = brw_wm_projective(p, dispatch, 0, 1, 12); + brw_wm_write__opacity(p, dispatch, src, mask); + + return true; +} diff --git a/cogl/driver/drm/cogl-attribute-drm-private.h b/cogl/driver/drm/cogl-attribute-drm-private.h new file mode 100644 index 00000000..382339c3 --- /dev/null +++ b/cogl/driver/drm/cogl-attribute-drm-private.h @@ -0,0 +1,42 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2012 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * . 
+ * + * + * + * Authors: + * Robert Bragg + */ + +#ifndef _COGL_ATTRIBUTE_NOP_PRIVATE_H_ +#define _COGL_ATTRIBUTE_NOP_PRIVATE_H_ + +#include "cogl-types.h" +#include "cogl-context-private.h" + +void +_cogl_drm_flush_attributes_state (CoglFramebuffer *framebuffer, + CoglPipeline *pipeline, + CoglFlushLayerState *layers_state, + CoglDrawFlags flags, + CoglAttribute **attributes, + int n_attributes); + +#endif /* _COGL_ATTRIBUTE_NOP_PRIVATE_H_ */ diff --git a/cogl/driver/drm/cogl-attribute-drm.c b/cogl/driver/drm/cogl-attribute-drm.c new file mode 100644 index 00000000..4e490326 --- /dev/null +++ b/cogl/driver/drm/cogl-attribute-drm.c @@ -0,0 +1,43 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2012 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * . 
+ * + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "cogl-types.h" +#include "cogl-framebuffer.h" +#include "cogl-attribute.h" +#include "cogl-attribute-private.h" +#include "cogl-attribute-drm-private.h" + +void +_cogl_drm_flush_attributes_state (CoglFramebuffer *framebuffer, + CoglPipeline *pipeline, + CoglFlushLayerState *layers_state, + CoglDrawFlags flags, + CoglAttribute **attributes, + int n_attributes) +{ +} diff --git a/cogl/driver/drm/cogl-clip-stack-drm-private.h b/cogl/driver/drm/cogl-clip-stack-drm-private.h new file mode 100644 index 00000000..bff6dec2 --- /dev/null +++ b/cogl/driver/drm/cogl-clip-stack-drm-private.h @@ -0,0 +1,38 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2012 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * . 
+ * + * + * + * Authors: + * Robert Bragg + */ + +#ifndef _COGL_CLIP_STACK_DRM_PRIVATE_H_ +#define _COGL_CLIP_STACK_DRM_PRIVATE_H_ + +#include "cogl-types.h" +#include "cogl-context-private.h" + +void +_cogl_clip_stack_drm_flush (CoglClipStack *stack, + CoglFramebuffer *framebuffer); + +#endif /* _COGL_CLIP_STACK_DRM_PRIVATE_H_ */ diff --git a/cogl/driver/drm/cogl-clip-stack-drm.c b/cogl/driver/drm/cogl-clip-stack-drm.c new file mode 100644 index 00000000..2851059d --- /dev/null +++ b/cogl/driver/drm/cogl-clip-stack-drm.c @@ -0,0 +1,37 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2012 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * . + * + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "cogl-clip-stack.h" +#include "cogl-clip-stack-drm-private.h" +#include "cogl-framebuffer-private.h" + +void +_cogl_clip_stack_drm_flush (CoglClipStack *stack, + CoglFramebuffer *framebuffer) +{ +} diff --git a/cogl/driver/drm/cogl-driver-drm.c b/cogl/driver/drm/cogl-driver-drm.c new file mode 100644 index 00000000..d076c76f --- /dev/null +++ b/cogl/driver/drm/cogl-driver-drm.c @@ -0,0 +1,82 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2012 Intel Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include + +#include "cogl-private.h" +#include "cogl-context-private.h" +#include "cogl-feature-private.h" +#include "cogl-renderer-private.h" +#include "cogl-error-private.h" +#include "cogl-framebuffer-drm-private.h" +#include "cogl-texture-2d-drm-private.h" +#include "cogl-attribute-drm-private.h" +#include "cogl-clip-stack-drm-private.h" + +static CoglBool +_cogl_driver_update_features (CoglContext *ctx, + CoglError **error) +{ + /* _cogl_gpu_info_init (ctx, &ctx->gpu); */ + + ctx->private_feature_flags = 0; + + return TRUE; +} + +const CoglDriverVtable +_cogl_driver_drm = + { + NULL, /* pixel_format_from_gl_internal */ + NULL, /* pixel_format_to_gl */ + _cogl_driver_update_features, + _cogl_offscreen_drm_allocate, + _cogl_offscreen_drm_free, + _cogl_framebuffer_drm_flush_state, + _cogl_framebuffer_drm_clear, + _cogl_framebuffer_drm_query_bits, + _cogl_framebuffer_drm_finish, + _cogl_framebuffer_drm_discard_buffers, + _cogl_framebuffer_drm_draw_attributes, + _cogl_framebuffer_drm_draw_indexed_attributes, + _cogl_framebuffer_drm_read_pixels_into_bitmap, + _cogl_texture_2d_drm_free, + _cogl_texture_2d_drm_can_create, + _cogl_texture_2d_drm_init, + _cogl_texture_2d_drm_allocate, + _cogl_texture_2d_drm_new_from_bitmap, +#if defined (COGL_HAS_EGL_SUPPORT) && 
defined (EGL_KHR_image_base) + _cogl_egl_texture_2d_drm_new_from_image, +#endif + _cogl_texture_2d_drm_copy_from_framebuffer, + _cogl_texture_2d_drm_get_gl_handle, + _cogl_texture_2d_drm_generate_mipmap, + _cogl_texture_2d_drm_copy_from_bitmap, + NULL, /* texture_2d_get_data */ + _cogl_drm_flush_attributes_state, + _cogl_clip_stack_drm_flush, + }; diff --git a/cogl/driver/drm/cogl-framebuffer-drm-private.h b/cogl/driver/drm/cogl-framebuffer-drm-private.h new file mode 100644 index 00000000..3728f1a1 --- /dev/null +++ b/cogl/driver/drm/cogl-framebuffer-drm-private.h @@ -0,0 +1,97 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2012 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * . 
+ * + * + * + * Authors: + * Robert Bragg + */ + +#ifndef _COGL_FRAMEBUFFER_DRM_PRIVATE_H_ +#define _COGL_FRAMEBUFFER_DRM_PRIVATE_H_ + +#include "cogl-types.h" +#include "cogl-context-private.h" + +CoglBool +_cogl_offscreen_drm_allocate (CoglOffscreen *offscreen, + CoglError **error); + +void +_cogl_offscreen_drm_free (CoglOffscreen *offscreen); + +void +_cogl_framebuffer_drm_flush_state (CoglFramebuffer *draw_buffer, + CoglFramebuffer *read_buffer, + CoglFramebufferState state); + +void +_cogl_framebuffer_drm_clear (CoglFramebuffer *framebuffer, + unsigned long buffers, + float red, + float green, + float blue, + float alpha); + +void +_cogl_framebuffer_drm_query_bits (CoglFramebuffer *framebuffer, + int *red, + int *green, + int *blue, + int *alpha); + +void +_cogl_framebuffer_drm_finish (CoglFramebuffer *framebuffer); + +void +_cogl_framebuffer_drm_discard_buffers (CoglFramebuffer *framebuffer, + unsigned long buffers); + +void +_cogl_framebuffer_drm_draw_attributes (CoglFramebuffer *framebuffer, + CoglPipeline *pipeline, + CoglVerticesMode mode, + int first_vertex, + int n_vertices, + CoglAttribute **attributes, + int n_attributes, + CoglDrawFlags flags); + +void +_cogl_framebuffer_drm_draw_indexed_attributes (CoglFramebuffer *framebuffer, + CoglPipeline *pipeline, + CoglVerticesMode mode, + int first_vertex, + int n_vertices, + CoglIndices *indices, + CoglAttribute **attributes, + int n_attributes, + CoglDrawFlags flags); + +CoglBool +_cogl_framebuffer_drm_read_pixels_into_bitmap (CoglFramebuffer *framebuffer, + int x, + int y, + CoglReadPixelsFlags source, + CoglBitmap *bitmap, + CoglError **error); + +#endif /* _COGL_FRAMEBUFFER_DRM_PRIVATE_H_ */ diff --git a/cogl/driver/drm/cogl-framebuffer-drm.c b/cogl/driver/drm/cogl-framebuffer-drm.c new file mode 100644 index 00000000..c3b608fb --- /dev/null +++ b/cogl/driver/drm/cogl-framebuffer-drm.c @@ -0,0 +1,121 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 
2007,2008,2009,2012 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * . + * + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "cogl-framebuffer-drm-private.h" + +#include + + +void +_cogl_framebuffer_drm_flush_state (CoglFramebuffer *draw_buffer, + CoglFramebuffer *read_buffer, + CoglFramebufferState state) +{ +} + +CoglBool +_cogl_offscreen_drm_allocate (CoglOffscreen *offscreen, + CoglError **error) +{ + return TRUE; +} + +void +_cogl_offscreen_drm_free (CoglOffscreen *offscreen) +{ +} + +void +_cogl_framebuffer_drm_clear (CoglFramebuffer *framebuffer, + unsigned long buffers, + float red, + float green, + float blue, + float alpha) +{ +} + +void +_cogl_framebuffer_drm_query_bits (CoglFramebuffer *framebuffer, + int *red, + int *green, + int *blue, + int *alpha) +{ + *red = 0; + *green = 0; + *blue = 0; + *alpha = 0; +} + +void +_cogl_framebuffer_drm_finish (CoglFramebuffer *framebuffer) +{ +} + +void +_cogl_framebuffer_drm_discard_buffers (CoglFramebuffer *framebuffer, + unsigned long buffers) +{ +} + +void +_cogl_framebuffer_drm_draw_attributes (CoglFramebuffer *framebuffer, + CoglPipeline *pipeline, + CoglVerticesMode mode, + int first_vertex, + int n_vertices, + CoglAttribute **attributes, + int n_attributes, + CoglDrawFlags flags) +{ +} + +void +_cogl_framebuffer_drm_draw_indexed_attributes (CoglFramebuffer *framebuffer, + 
CoglPipeline *pipeline, + CoglVerticesMode mode, + int first_vertex, + int n_vertices, + CoglIndices *indices, + CoglAttribute **attributes, + int n_attributes, + CoglDrawFlags flags) +{ +} + +CoglBool +_cogl_framebuffer_drm_read_pixels_into_bitmap (CoglFramebuffer *framebuffer, + int x, + int y, + CoglReadPixelsFlags source, + CoglBitmap *bitmap, + CoglError **error) +{ + return TRUE; +} diff --git a/cogl/driver/drm/cogl-texture-2d-drm-private.h b/cogl/driver/drm/cogl-texture-2d-drm-private.h new file mode 100644 index 00000000..2f7cfaaf --- /dev/null +++ b/cogl/driver/drm/cogl-texture-2d-drm-private.h @@ -0,0 +1,118 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2012 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * . 
+ * + * + * + * Authors: + * Robert Bragg + */ + +#ifndef _COGL_TEXTURE_2D_DRM_PRIVATE_H_ +#define _COGL_TEXTURE_2D_DRM_PRIVATE_H_ + +#include "cogl-types.h" +#include "cogl-context-private.h" +#include "cogl-texture.h" + +void +_cogl_texture_2d_drm_free (CoglTexture2D *tex_2d); + +CoglBool +_cogl_texture_2d_drm_can_create (CoglContext *ctx, + int width, + int height, + CoglPixelFormat internal_format); + +void +_cogl_texture_2d_drm_init (CoglTexture2D *tex_2d); + +CoglTexture2D * +_cogl_texture_2d_drm_new_with_size (CoglContext *ctx, + int width, + int height, + CoglPixelFormat internal_format, + CoglError **error); +CoglBool +_cogl_texture_2d_drm_allocate (CoglTexture *tex, + CoglError **error); + +CoglTexture2D * +_cogl_texture_2d_drm_new_from_bitmap (CoglBitmap *bmp, + CoglPixelFormat internal_format, + CoglError **error); + +#if defined (COGL_HAS_EGL_SUPPORT) && defined (EGL_KHR_image_base) +CoglTexture2D * +_cogl_egl_texture_2d_drm_new_from_image (CoglContext *ctx, + int width, + int height, + CoglPixelFormat format, + EGLImageKHR image, + CoglError **error); +#endif + +void +_cogl_texture_2d_drm_flush_legacy_texobj_filters (CoglTexture *tex, + GLenum min_filter, + GLenum mag_filter); + +void +_cogl_texture_2d_drm_flush_legacy_texobj_wrap_modes (CoglTexture *tex, + GLenum wrap_mode_s, + GLenum wrap_mode_t, + GLenum wrap_mode_p); + +void +_cogl_texture_2d_drm_copy_from_framebuffer (CoglTexture2D *tex_2d, + int src_x, + int src_y, + int width, + int height, + CoglFramebuffer *src_fb, + int dst_x, + int dst_y, + int level); + +unsigned int +_cogl_texture_2d_drm_get_gl_handle (CoglTexture2D *tex_2d); + +void +_cogl_texture_2d_drm_generate_mipmap (CoglTexture2D *tex_2d); + +CoglBool +_cogl_texture_2d_drm_copy_from_bitmap (CoglTexture2D *tex_2d, + int src_x, + int src_y, + int width, + int height, + CoglBitmap *bitmap, + int dst_x, + int dst_y, + int level, + CoglError **error); + +void +_cogl_texture_2d_drm_get_data (CoglTexture2D *tex_2d, + CoglPixelFormat 
format, + size_t rowstride, + uint8_t *data); + +#endif /* _COGL_TEXTURE_2D_DRM_PRIVATE_H_ */ diff --git a/cogl/driver/drm/cogl-texture-2d-drm.c b/cogl/driver/drm/cogl-texture-2d-drm.c new file mode 100644 index 00000000..0ff82a9d --- /dev/null +++ b/cogl/driver/drm/cogl-texture-2d-drm.c @@ -0,0 +1,167 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2009,2010,2011,2012 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . 
+ * + * + * + * Authors: + * Neil Roberts + * Robert Bragg + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include + +#include "cogl-private.h" +#include "cogl-texture-2d-drm-private.h" +#include "cogl-texture-2d-private.h" +#include "cogl-error-private.h" + +void +_cogl_texture_2d_drm_free (CoglTexture2D *tex_2d) +{ +} + +CoglBool +_cogl_texture_2d_drm_can_create (CoglContext *ctx, + int width, + int height, + CoglPixelFormat internal_format) +{ + return TRUE; +} + +void +_cogl_texture_2d_drm_init (CoglTexture2D *tex_2d) +{ +} + +CoglTexture2D * +_cogl_texture_2d_drm_new_with_size (CoglContext *ctx, + int width, + int height, + CoglPixelFormat internal_format, + CoglError **error) +{ + return _cogl_texture_2d_create_base (ctx, + width, height, + internal_format); +} + +CoglBool +_cogl_texture_2d_drm_allocate (CoglTexture *tex, + CoglError **error) +{ + return TRUE; +} + +CoglTexture2D * +_cogl_texture_2d_drm_new_from_bitmap (CoglBitmap *bmp, + CoglPixelFormat internal_format, + CoglError **error) +{ + return _cogl_texture_2d_drm_new_with_size (_cogl_bitmap_get_context (bmp), + cogl_bitmap_get_width (bmp), + cogl_bitmap_get_height (bmp), + internal_format, + error); +} + +#if defined (COGL_HAS_EGL_SUPPORT) && defined (EGL_KHR_image_base) +CoglTexture2D * +_cogl_egl_texture_2d_drm_new_from_image (CoglContext *ctx, + int width, + int height, + CoglPixelFormat format, + EGLImageKHR image, + CoglError **error) +{ + _cogl_set_error (error, + COGL_SYSTEM_ERROR, + COGL_SYSTEM_ERROR_UNSUPPORTED, + "Creating 2D textures from an EGLImage isn't " + "supported by the NOP backend"); + return NULL; +} +#endif + +void +_cogl_texture_2d_drm_flush_legacy_texobj_filters (CoglTexture *tex, + GLenum min_filter, + GLenum mag_filter) +{ +} + +void +_cogl_texture_2d_drm_flush_legacy_texobj_wrap_modes (CoglTexture *tex, + GLenum wrap_mode_s, + GLenum wrap_mode_t, + GLenum wrap_mode_p) +{ +} + +void +_cogl_texture_2d_drm_copy_from_framebuffer (CoglTexture2D *tex_2d, + int 
src_x, + int src_y, + int width, + int height, + CoglFramebuffer *src_fb, + int dst_x, + int dst_y, + int level) +{ +} + +unsigned int +_cogl_texture_2d_drm_get_gl_handle (CoglTexture2D *tex_2d) +{ + return 0; +} + +void +_cogl_texture_2d_drm_generate_mipmap (CoglTexture2D *tex_2d) +{ +} + +CoglBool +_cogl_texture_2d_drm_copy_from_bitmap (CoglTexture2D *tex_2d, + int src_x, + int src_y, + int width, + int height, + CoglBitmap *bitmap, + int dst_x, + int dst_y, + int level, + CoglError **error) +{ + return TRUE; +} + +void +_cogl_texture_2d_drm_get_data (CoglTexture2D *tex_2d, + CoglPixelFormat format, + size_t rowstride, + uint8_t *data) +{ +} diff --git a/cogl/driver/drm/compiler.h b/cogl/driver/drm/compiler.h new file mode 100644 index 00000000..ff80365e --- /dev/null +++ b/cogl/driver/drm/compiler.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Chris Wilson + * + */ + +#ifndef _SNA_COMPILER_H_ +#define _SNA_COMPILER_H_ + +#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__) +#define likely(expr) (__builtin_expect (!!(expr), 1)) +#define unlikely(expr) (__builtin_expect (!!(expr), 0)) +#define noinline __attribute__((noinline)) +#define force_inline inline __attribute__((always_inline)) +#define fastcall __attribute__((regparm(3))) +#define must_check __attribute__((warn_unused_result)) +#define constant __attribute__((const)) +#else +#define likely(expr) (expr) +#define unlikely(expr) (expr) +#define noinline +#define force_inline +#define fastcall +#define must_check +#define constant +#endif + +#ifdef HAVE_VALGRIND +#define VG(x) x +#else +#define VG(x) +#endif + +#define VG_CLEAR(s) VG(memset(&s, 0, sizeof(s))) + +#define COMPILE_TIME_ASSERT(E) ((void)sizeof(char[1 - 2*!(E)])) + +#endif /* _SNA_COMPILER_H_ */ diff --git a/cogl/driver/drm/intel_list.h b/cogl/driver/drm/intel_list.h new file mode 100644 index 00000000..a3e3227c --- /dev/null +++ b/cogl/driver/drm/intel_list.h @@ -0,0 +1,408 @@ +/* + * Copyright © 2010-2012 Intel Corporation + * Copyright © 2010 Francisco Jerez + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including 
the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef _INTEL_LIST_H_ +#define _INTEL_LIST_H_ + +#include +//#include + +#if 1 //XORG_VERSION_CURRENT < XORG_VERSION_NUMERIC(1,9,0,0,0) || XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,11,99,903,0) + +#include + +/** + * @file Classic doubly-link circular list implementation. + * For real usage examples of the linked list, see the file test/list.c + * + * Example: + * We need to keep a list of struct foo in the parent struct bar, i.e. what + * we want is something like this. + * + * struct bar { + * ... + * struct foo *list_of_foos; -----> struct foo {}, struct foo {}, struct foo{} + * ... + * } + * + * We need one list head in bar and a list element in all list_of_foos (both are of + * data type 'struct list'). + * + * struct bar { + * ... + * struct list list_of_foos; + * ... + * } + * + * struct foo { + * ... + * struct list entry; + * ... + * } + * + * Now we initialize the list head: + * + * struct bar bar; + * ... + * list_init(&bar.list_of_foos); + * + * Then we create the first element and add it to this list: + * + * struct foo *foo = malloc(...); + * .... + * list_add(&foo->entry, &bar.list_of_foos); + * + * Repeat the above for each element you want to add to the list. Deleting + * works with the element itself. + * list_del(&foo->entry); + * free(foo); + * + * Note: calling list_del(&bar.list_of_foos) will set bar.list_of_foos to an empty + * list again. 
+ * + * Looping through the list requires a 'struct foo' as iterator and the + * name of the field the subnodes use. + * + * struct foo *iterator; + * list_for_each_entry(iterator, &bar.list_of_foos, entry) { + * if (iterator->something == ...) + * ... + * } + * + * Note: You must not call list_del() on the iterator if you continue the + * loop. You need to run the safe for-each loop instead: + * + * struct foo *iterator, *next; + * list_for_each_entry_safe(iterator, next, &bar.list_of_foos, entry) { + * if (...) + * list_del(&iterator->entry); + * } + * + */ + +/** + * The linkage struct for list nodes. This struct must be part of your + * to-be-linked struct. struct list is required for both the head of the + * list and for each list node. + * + * Position and name of the struct list field is irrelevant. + * There are no requirements that elements of a list are of the same type. + * There are no requirements for a list head, any struct list can be a list + * head. + */ +struct list { + struct list *next, *prev; +}; + +/** + * Initialize the list as an empty list. + * + * Example: + * list_init(&bar->list_of_foos); + * + * @param The list to initialized. + */ +static void +list_init(struct list *list) +{ + list->next = list->prev = list; +} + +static inline void +__list_add(struct list *entry, + struct list *prev, + struct list *next) +{ + next->prev = entry; + entry->next = next; + entry->prev = prev; + prev->next = entry; +} + +/** + * Insert a new element after the given list head. The new element does not + * need to be initialised as empty list. + * The list changes from: + * head → some element → ... + * to + * head → new element → older element → ... + * + * Example: + * struct foo *newfoo = malloc(...); + * list_add(&newfoo->entry, &bar->list_of_foos); + * + * @param entry The new element to prepend to the list. + * @param head The existing list. 
+ */ +static inline void +list_add(struct list *entry, struct list *head) +{ + __list_add(entry, head, head->next); +} + +static inline void +list_add_tail(struct list *entry, struct list *head) +{ + __list_add(entry, head->prev, head); +} + +static inline void list_replace(struct list *old, + struct list *new) +{ + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; +} + +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * Append a new element to the end of the list given with this list head. + * + * The list changes from: + * head → some element → ... → lastelement + * to + * head → some element → ... → lastelement → new element + * + * Example: + * struct foo *newfoo = malloc(...); + * list_append(&newfoo->entry, &bar->list_of_foos); + * + * @param entry The new element to prepend to the list. + * @param head The existing list. + */ +static inline void +list_append(struct list *entry, struct list *head) +{ + __list_add(entry, head->prev, head); +} + + +static inline void +__list_del(struct list *prev, struct list *next) +{ + assert(next->prev == prev->next); + next->prev = prev; + prev->next = next; +} + +static inline void +_list_del(struct list *entry) +{ + assert(entry->prev->next == entry); + assert(entry->next->prev == entry); + __list_del(entry->prev, entry->next); +} + +/** + * Remove the element from the list it is in. Using this function will reset + * the pointers to/from this element so it is removed from the list. It does + * NOT free the element itself or manipulate it otherwise. + * + * Using list_del on a pure list head (like in the example at the top of + * this file) will NOT remove the first element from + * the list but rather reset the list as empty list. + * + * Example: + * list_del(&foo->entry); + * + * @param entry The element to remove. 
+ */ +static inline void +list_del(struct list *entry) +{ + _list_del(entry); + list_init(entry); +} + +static inline void list_move(struct list *list, struct list *head) +{ + if (list->prev != head) { + _list_del(list); + list_add(list, head); + } +} + +static inline void list_move_tail(struct list *list, struct list *head) +{ + _list_del(list); + list_add_tail(list, head); +} + +/** + * Check if the list is empty. + * + * Example: + * list_is_empty(&bar->list_of_foos); + * + * @return True if the list contains one or more elements or False otherwise. + */ +static inline bool +list_is_empty(struct list *head) +{ + return head->next == head; +} + +/** + * Alias of container_of + */ +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * Retrieve the first list entry for the given list pointer. + * + * Example: + * struct foo *first; + * first = list_first_entry(&bar->list_of_foos, struct foo, list_of_foos); + * + * @param ptr The list head + * @param type Data type of the list element to retrieve + * @param member Member name of the struct list field in the list element. + * @return A pointer to the first list element. + */ +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +/** + * Retrieve the last list entry for the given listpointer. + * + * Example: + * struct foo *first; + * first = list_last_entry(&bar->list_of_foos, struct foo, list_of_foos); + * + * @param ptr The list head + * @param type Data type of the list element to retrieve + * @param member Member name of the struct list field in the list element. + * @return A pointer to the last list element. + */ +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +#define __container_of(ptr, sample, member) \ + (void *)((char *)(ptr) \ + - ((char *)&(sample)->member - (char *)(sample))) +/** + * Loop through the list given by head and set pos to struct in the list. 
+ * + * Example: + * struct foo *iterator; + * list_for_each_entry(iterator, &bar->list_of_foos, entry) { + * [modify iterator] + * } + * + * This macro is not safe for node deletion. Use list_for_each_entry_safe + * instead. + * + * @param pos Iterator variable of the type of the list elements. + * @param head List head + * @param member Member name of the struct list in the list elements. + * + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = __container_of((head)->next, pos, member); \ + &pos->member != (head); \ + pos = __container_of(pos->member.next, pos, member)) + +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = __container_of((head)->prev, pos, member); \ + &pos->member != (head); \ + pos = __container_of(pos->member.prev, pos, member)) + +/** + * Loop through the list, keeping a backup pointer to the element. This + * macro allows for the deletion of a list element while looping through the + * list. + * + * See list_for_each_entry for more details. 
+ */ +#define list_for_each_entry_safe(pos, tmp, head, member) \ + for (pos = __container_of((head)->next, pos, member), \ + tmp = __container_of(pos->member.next, pos, member); \ + &pos->member != (head); \ + pos = tmp, tmp = __container_of(pos->member.next, tmp, member)) + +#else + +#include + +static inline void +list_add_tail(struct list *entry, struct list *head) +{ + __list_add(entry, head->prev, head); +} + +static inline void +_list_del(struct list *entry) +{ + assert(entry->prev->next == entry); + assert(entry->next->prev == entry); + __list_del(entry->prev, entry->next); +} + +static inline void list_replace(struct list *old, + struct list *new) +{ + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; +} + +static inline void list_move(struct list *list, struct list *head) +{ + if (list->prev != head) { + _list_del(list); + list_add(list, head); + } +} + +static inline void list_move_tail(struct list *list, struct list *head) +{ + _list_del(list); + list_add_tail(list, head); +} + +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = __container_of((head)->prev, pos, member); \ + &pos->member != (head); \ + pos = __container_of(pos->member.prev, pos, member)) + +#endif + +#undef container_of +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - (char *) &((type *)0)->member)) + +#endif /* _INTEL_LIST_H_ */ + diff --git a/cogl/driver/drm/kgem.c b/cogl/driver/drm/kgem.c new file mode 100644 index 00000000..9c016941 --- /dev/null +++ b/cogl/driver/drm/kgem.c @@ -0,0 +1,5182 @@ +/* + * Copyright (c) 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, 
merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Chris Wilson + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "sna.h" +#include "sna_reg.h" + +#include +#include +#include +#include +#include +#include + +#include + +#ifdef HAVE_VALGRIND +#include +#include +#endif + +#if HAVE_SYS_SYSINFO_H +#include +#endif + +static struct kgem_bo * +search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags); + +static struct kgem_bo * +search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags); + +#define DBG_NO_HW 0 +#define DBG_NO_TILING 0 +#define DBG_NO_CACHE 0 +#define DBG_NO_CACHE_LEVEL 0 +#define DBG_NO_CPU 0 +#define DBG_NO_USERPTR 0 +#define DBG_NO_LLC 0 +#define DBG_NO_SEMAPHORES 0 +#define DBG_NO_MADV 0 +#define DBG_NO_UPLOAD_CACHE 0 +#define DBG_NO_UPLOAD_ACTIVE 0 +#define DBG_NO_MAP_UPLOAD 0 +#define DBG_NO_RELAXED_FENCING 0 +#define DBG_NO_SECURE_BATCHES 0 +#define DBG_DUMP 0 + +#define SHOW_BATCH 0 + +/* Worst case seems to be 965gm where we cannot write within a cacheline that + * is being simultaneously being read by the GPU, or within the sampler + * prefetch. 
In general, the chipsets seem to have a requirement that sampler + * offsets be aligned to a cacheline (64 bytes). + */ +#define UPLOAD_ALIGNMENT 128 + +#define PAGE_ALIGN(x) ALIGN(x, PAGE_SIZE) +#define NUM_PAGES(x) (((x) + PAGE_SIZE-1) / PAGE_SIZE) + +#define MAX_GTT_VMA_CACHE 512 +#define MAX_CPU_VMA_CACHE INT16_MAX +#define MAP_PRESERVE_TIME 10 + +#define MAP(ptr) ((void*)((uintptr_t)(ptr) & ~3)) +#define MAKE_CPU_MAP(ptr) ((void*)((uintptr_t)(ptr) | 1)) +#define MAKE_USER_MAP(ptr) ((void*)((uintptr_t)(ptr) | 3)) +#define IS_USER_MAP(ptr) ((uintptr_t)(ptr) & 2) +#define __MAP_TYPE(ptr) ((uintptr_t)(ptr) & 3) + +#define LOCAL_I915_PARAM_HAS_SEMAPHORES 20 +#define LOCAL_I915_PARAM_HAS_SECURE_BATCHES 23 + +#define LOCAL_I915_GEM_USERPTR 0x32 +#define LOCAL_IOCTL_I915_GEM_USERPTR DRM_IOWR (DRM_COMMAND_BASE + LOCAL_I915_GEM_USERPTR, struct local_i915_gem_userptr) +struct local_i915_gem_userptr { + uint64_t user_ptr; + uint32_t user_size; + uint32_t flags; +#define I915_USERPTR_READ_ONLY 0x1 + uint32_t handle; +}; + +#define UNCACHED 0 +#define SNOOPED 1 + +struct local_i915_gem_cacheing { + uint32_t handle; + uint32_t cacheing; +}; + +#define LOCAL_I915_GEM_SET_CACHEING 0x2f +#define LOCAL_IOCTL_I915_GEM_SET_CACHEING DRM_IOW(DRM_COMMAND_BASE + LOCAL_I915_GEM_SET_CACHEING, struct local_i915_gem_cacheing) + +struct kgem_buffer { + struct kgem_bo base; + void *mem; + uint32_t used; + uint32_t need_io : 1; + uint32_t write : 2; + uint32_t mmapped : 1; +}; + +static struct kgem_bo *__kgem_freed_bo; +static struct kgem_request *__kgem_freed_request; +static struct drm_i915_gem_exec_object2 _kgem_dummy_exec; + +static inline int bytes(struct kgem_bo *bo) +{ + return __kgem_bo_size(bo); +} + +#define bucket(B) (B)->size.pages.bucket +#define num_pages(B) (B)->size.pages.count + +#ifdef DEBUG_MEMORY +static void debug_alloc(struct kgem *kgem, size_t size) +{ + kgem->debug_memory.bo_allocs++; + kgem->debug_memory.bo_bytes += size; +} +static void debug_alloc__bo(struct kgem 
*kgem, struct kgem_bo *bo) +{ + debug_alloc(kgem, bytes(bo)); +} +#else +#define debug_alloc(k, b) +#define debug_alloc__bo(k, b) +#endif + +static void kgem_sna_reset(struct kgem *kgem) +{ + struct sna *sna = container_of(kgem, struct sna, kgem); + + sna->render.reset(sna); + sna->blt_state.fill_bo = 0; +} + +static void kgem_sna_flush(struct kgem *kgem) +{ + struct sna *sna = container_of(kgem, struct sna, kgem); + + sna->render.flush(sna); + + if (sna->render.solid_cache.dirty) + sna_render_flush_solid(sna); +} + +static int gem_set_tiling(int fd, uint32_t handle, int tiling, int stride) +{ + struct drm_i915_gem_set_tiling set_tiling; + int ret; + + if (DBG_NO_TILING) + return I915_TILING_NONE; + + VG_CLEAR(set_tiling); + do { + set_tiling.handle = handle; + set_tiling.tiling_mode = tiling; + set_tiling.stride = stride; + + ret = ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling); + } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); + return set_tiling.tiling_mode; +} + +static bool gem_set_cacheing(int fd, uint32_t handle, int cacheing) +{ + struct local_i915_gem_cacheing arg; + + VG_CLEAR(arg); + arg.handle = handle; + arg.cacheing = cacheing; + return drmIoctl(fd, LOCAL_IOCTL_I915_GEM_SET_CACHEING, &arg) == 0; +} + +static uint32_t gem_userptr(int fd, void *ptr, int size, int read_only) +{ + struct local_i915_gem_userptr arg; + + VG_CLEAR(arg); + arg.user_ptr = (uintptr_t)ptr; + arg.user_size = size; + arg.flags = 0; + if (read_only) + arg.flags |= I915_USERPTR_READ_ONLY; + + if (drmIoctl(fd, LOCAL_IOCTL_I915_GEM_USERPTR, &arg)) { + DBG(("%s: failed to map %p + %d bytes: %d\n", + __FUNCTION__, ptr, size, errno)); + return 0; + } + + return arg.handle; +} + +static bool __kgem_throttle_retire(struct kgem *kgem, unsigned flags) +{ + if (flags & CREATE_NO_RETIRE) { + DBG(("%s: not retiring per-request\n", __FUNCTION__)); + return false; + } + + if (!kgem->need_retire) { + DBG(("%s: nothing to retire\n", __FUNCTION__)); + return false; + } + + if 
(kgem_retire(kgem)) + return true; + + if (flags & CREATE_NO_THROTTLE || !kgem->need_throttle) { + DBG(("%s: not throttling\n", __FUNCTION__)); + return false; + } + + kgem_throttle(kgem); + return kgem_retire(kgem); +} + +static void *__kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo) +{ + struct drm_i915_gem_mmap_gtt mmap_arg; + void *ptr; + + DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__, + bo->handle, bytes(bo))); + assert(bo->proxy == NULL); + +retry_gtt: + VG_CLEAR(mmap_arg); + mmap_arg.handle = bo->handle; + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg)) { + ErrorF("%s: failed to retrieve GTT offset for handle=%d: %d\n", + __FUNCTION__, bo->handle, errno); + (void)__kgem_throttle_retire(kgem, 0); + if (kgem_expire_cache(kgem)) + goto retry_gtt; + + return NULL; + } + +retry_mmap: + ptr = mmap(0, bytes(bo), PROT_READ | PROT_WRITE, MAP_SHARED, + kgem->fd, mmap_arg.offset); + if (ptr == MAP_FAILED) { + ErrorF("%s: failed to mmap %d, %d bytes, into GTT domain: %d\n", + __FUNCTION__, bo->handle, bytes(bo), errno); + if (__kgem_throttle_retire(kgem, 0)) + goto retry_mmap; + + ptr = NULL; + } + + return ptr; +} + +static int __gem_write(int fd, uint32_t handle, + int offset, int length, + const void *src) +{ + struct drm_i915_gem_pwrite pwrite; + + DBG(("%s(handle=%d, offset=%d, len=%d)\n", __FUNCTION__, + handle, offset, length)); + + VG_CLEAR(pwrite); + pwrite.handle = handle; + pwrite.offset = offset; + pwrite.size = length; + pwrite.data_ptr = (uintptr_t)src; + return drmIoctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite); +} + +static int gem_write(int fd, uint32_t handle, + int offset, int length, + const void *src) +{ + struct drm_i915_gem_pwrite pwrite; + + DBG(("%s(handle=%d, offset=%d, len=%d)\n", __FUNCTION__, + handle, offset, length)); + + VG_CLEAR(pwrite); + pwrite.handle = handle; + /* align the transfer to cachelines; fortuitously this is safe! 
*/ + if ((offset | length) & 63) { + pwrite.offset = offset & ~63; + pwrite.size = ALIGN(offset+length, 64) - pwrite.offset; + pwrite.data_ptr = (uintptr_t)src + pwrite.offset - offset; + } else { + pwrite.offset = offset; + pwrite.size = length; + pwrite.data_ptr = (uintptr_t)src; + } + return drmIoctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite); +} + +static int gem_read(int fd, uint32_t handle, const void *dst, + int offset, int length) +{ + struct drm_i915_gem_pread pread; + int ret; + + DBG(("%s(handle=%d, len=%d)\n", __FUNCTION__, + handle, length)); + + VG_CLEAR(pread); + pread.handle = handle; + pread.offset = offset; + pread.size = length; + pread.data_ptr = (uintptr_t)dst; + ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_PREAD, &pread); + if (ret) { + DBG(("%s: failed, errno=%d\n", __FUNCTION__, errno)); + return ret; + } + + VG(VALGRIND_MAKE_MEM_DEFINED(dst, length)); + return 0; +} + +static bool +kgem_busy(struct kgem *kgem, int handle) +{ + struct drm_i915_gem_busy busy; + + VG_CLEAR(busy); + busy.handle = handle; + busy.busy = !kgem->wedged; + (void)drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_BUSY, &busy); + DBG(("%s: handle=%d, busy=%d, wedged=%d\n", + __FUNCTION__, handle, busy.busy, kgem->wedged)); + + return busy.busy; +} + +void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo) +{ + DBG(("%s: handle=%d, domain=%d\n", + __FUNCTION__, bo->handle, bo->domain)); + assert(bo->flush || !kgem_busy(kgem, bo->handle)); + + if (bo->rq) + kgem_retire(kgem); + + if (bo->exec == NULL) { + DBG(("%s: retiring bo handle=%d (needed flush? %d), rq? 
%d\n", + __FUNCTION__, bo->handle, bo->needs_flush, bo->rq != NULL)); + assert(list_is_empty(&bo->vma)); + bo->rq = NULL; + list_del(&bo->request); + + bo->needs_flush = false; + } + + bo->domain = DOMAIN_NONE; +} + +bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo, + const void *data, int length) +{ + assert(bo->refcnt); + assert(!bo->purged); + assert(bo->flush || !kgem_busy(kgem, bo->handle)); + assert(bo->proxy == NULL); + + assert(length <= bytes(bo)); + if (gem_write(kgem->fd, bo->handle, 0, length, data)) + return false; + + DBG(("%s: flush=%d, domain=%d\n", __FUNCTION__, bo->flush, bo->domain)); + kgem_bo_retire(kgem, bo); + return true; +} + +static uint32_t gem_create(int fd, int num_pages) +{ + struct drm_i915_gem_create create; + + VG_CLEAR(create); + create.handle = 0; + create.size = PAGE_SIZE * num_pages; + (void)drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE, &create); + + return create.handle; +} + +static bool +kgem_bo_set_purgeable(struct kgem *kgem, struct kgem_bo *bo) +{ +#if DBG_NO_MADV + return true; +#else + struct drm_i915_gem_madvise madv; + + assert(bo->exec == NULL); + assert(!bo->purged); + + VG_CLEAR(madv); + madv.handle = bo->handle; + madv.madv = I915_MADV_DONTNEED; + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv) == 0) { + bo->purged = 1; + kgem->need_purge |= !madv.retained && bo->domain == DOMAIN_GPU; + return madv.retained; + } + + return true; +#endif +} + +static bool +kgem_bo_is_retained(struct kgem *kgem, struct kgem_bo *bo) +{ +#if DBG_NO_MADV + return true; +#else + struct drm_i915_gem_madvise madv; + + if (!bo->purged) + return true; + + VG_CLEAR(madv); + madv.handle = bo->handle; + madv.madv = I915_MADV_DONTNEED; + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv) == 0) + return madv.retained; + + return false; +#endif +} + +static bool +kgem_bo_clear_purgeable(struct kgem *kgem, struct kgem_bo *bo) +{ +#if DBG_NO_MADV + return true; +#else + struct drm_i915_gem_madvise madv; + + assert(bo->purged); + + 
VG_CLEAR(madv); + madv.handle = bo->handle; + madv.madv = I915_MADV_WILLNEED; + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv) == 0) { + bo->purged = !madv.retained; + kgem->need_purge |= !madv.retained && bo->domain == DOMAIN_GPU; + return madv.retained; + } + + return false; +#endif +} + +static void gem_close(int fd, uint32_t handle) +{ + struct drm_gem_close close; + + VG_CLEAR(close); + close.handle = handle; + (void)drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &close); +} + +constant inline static unsigned long __fls(unsigned long word) +{ + asm("bsr %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} + +constant inline static int cache_bucket(int num_pages) +{ + return __fls(num_pages); +} + +static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo, + int handle, int num_pages) +{ + assert(num_pages); + memset(bo, 0, sizeof(*bo)); + + bo->refcnt = 1; + bo->handle = handle; + num_pages(bo) = num_pages; + bucket(bo) = cache_bucket(num_pages); + bo->reusable = true; + bo->domain = DOMAIN_CPU; + list_init(&bo->request); + list_init(&bo->list); + list_init(&bo->vma); + + return bo; +} + +static struct kgem_bo *__kgem_bo_alloc(int handle, int num_pages) +{ + struct kgem_bo *bo; + + if (__kgem_freed_bo) { + bo = __kgem_freed_bo; + __kgem_freed_bo = *(struct kgem_bo **)bo; + } else { + bo = malloc(sizeof(*bo)); + if (bo == NULL) + return NULL; + } + + return __kgem_bo_init(bo, handle, num_pages); +} + +static struct kgem_request _kgem_static_request; + +static struct kgem_request *__kgem_request_alloc(void) +{ + struct kgem_request *rq; + + rq = __kgem_freed_request; + if (rq) { + __kgem_freed_request = *(struct kgem_request **)rq; + } else { + rq = malloc(sizeof(*rq)); + if (rq == NULL) + rq = &_kgem_static_request; + } + + list_init(&rq->buffers); + rq->bo = NULL; + rq->ring = 0; + + return rq; +} + +static void __kgem_request_free(struct kgem_request *rq) +{ + _list_del(&rq->list); + *(struct kgem_request **)rq = __kgem_freed_request; + 
__kgem_freed_request = rq; +} + +static struct list *inactive(struct kgem *kgem, int num_pages) +{ + return &kgem->inactive[cache_bucket(num_pages)]; +} + +static struct list *active(struct kgem *kgem, int num_pages, int tiling) +{ + return &kgem->active[cache_bucket(num_pages)][tiling]; +} + +static size_t +agp_aperture_size(struct pci_device *dev, unsigned gen) +{ + /* XXX assume that only future chipsets are unknown and follow + * the post gen2 PCI layout. + */ + return dev->regions[gen < 30 ? 0 : 2].size; +} + +static size_t +total_ram_size(void) +{ +#if HAVE_SYS_SYSINFO_H + struct sysinfo info; + if (sysinfo(&info) == 0) + return info.totalram * info.mem_unit; +#endif + + return 0; +} + +static size_t +cpu_cache_size(void) +{ + FILE *file = fopen("/proc/cpuinfo", "r"); + size_t size = -1; + if (file) { + size_t len = 0; + char *line = NULL; + while (getline(&line, &len, file) != -1) { + int mb; + if (sscanf(line, "cache size : %d KB", &mb) == 1) { + /* Paranoid check against gargantuan caches */ + if (mb <= 1<<20) + size = mb * 1024; + break; + } + } + free(line); + fclose(file); + } + if (size == -1) + ErrorF("Unknown CPU cache size\n"); + return size; +} + +static int gem_param(struct kgem *kgem, int name) +{ + drm_i915_getparam_t gp; + int v = -1; /* No param uses the sign bit, reserve it for errors */ + + VG_CLEAR(gp); + gp.param = name; + gp.value = &v; + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GETPARAM, &gp)) + return -1; + + VG(VALGRIND_MAKE_MEM_DEFINED(&v, sizeof(v))); + return v; +} + +static bool test_has_execbuffer2(struct kgem *kgem) +{ + struct drm_i915_gem_execbuffer2 execbuf; + + memset(&execbuf, 0, sizeof(execbuf)); + execbuf.buffer_count = 1; + + return (drmIoctl(kgem->fd, + DRM_IOCTL_I915_GEM_EXECBUFFER2, + &execbuf) == -1 && + errno == EFAULT); +} + +static bool test_has_semaphores_enabled(struct kgem *kgem) +{ + FILE *file; + bool detected = false; + int ret; + + if (DBG_NO_SEMAPHORES) + return false; + + ret = gem_param(kgem, 
LOCAL_I915_PARAM_HAS_SEMAPHORES); + if (ret != -1) + return ret > 0; + + file = fopen("/sys/module/i915/parameters/semaphores", "r"); + if (file) { + int value; + if (fscanf(file, "%d", &value) == 1) + detected = value != 0; + fclose(file); + } + + return detected; +} + +static bool __kgem_throttle(struct kgem *kgem) +{ + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_THROTTLE, NULL) == 0) + return false; + + return errno == EIO; +} + +static bool is_hw_supported(struct kgem *kgem, + struct pci_device *dev) +{ + if (DBG_NO_HW) + return false; + + if (!test_has_execbuffer2(kgem)) + return false; + + if (kgem->gen == (unsigned)-1) /* unknown chipset, assume future gen */ + return kgem->has_blt; + + /* Although pre-855gm the GMCH is fubar, it works mostly. So + * let the user decide through "NoAccel" whether or not to risk + * hw acceleration. + */ + + if (kgem->gen == 60 && dev->revision < 8) { + /* pre-production SNB with dysfunctional BLT */ + return false; + } + + if (kgem->gen >= 60) /* Only if the kernel supports the BLT ring */ + return kgem->has_blt; + + return true; +} + +static bool test_has_relaxed_fencing(struct kgem *kgem) +{ + if (kgem->gen < 40) { + if (DBG_NO_RELAXED_FENCING) + return false; + + return gem_param(kgem, I915_PARAM_HAS_RELAXED_FENCING) > 0; + } else + return true; +} + +static bool test_has_llc(struct kgem *kgem) +{ + int has_llc = -1; + + if (DBG_NO_LLC) + return false; + +#if defined(I915_PARAM_HAS_LLC) /* Expected in libdrm-2.4.31 */ + has_llc = gem_param(kgem, I915_PARAM_HAS_LLC); +#endif + if (has_llc == -1) { + DBG(("%s: no kernel/drm support for HAS_LLC, assuming support for LLC based on GPU generation\n", __FUNCTION__)); + has_llc = kgem->gen >= 60; + } + + return has_llc; +} + +static bool test_has_cacheing(struct kgem *kgem) +{ + uint32_t handle; + bool ret; + + if (DBG_NO_CACHE_LEVEL) + return false; + + /* Incoherent blt and sampler hangs the GPU */ + if (kgem->gen == 40) + return false; + + handle = gem_create(kgem->fd, 1); + if 
(handle == 0) + return false; + + ret = gem_set_cacheing(kgem->fd, handle, UNCACHED); + gem_close(kgem->fd, handle); + return ret; +} + +static bool test_has_userptr(struct kgem *kgem) +{ +#if defined(USE_USERPTR) + uint32_t handle; + void *ptr; + + if (DBG_NO_USERPTR) + return false; + + /* Incoherent blt and sampler hangs the GPU */ + if (kgem->gen == 40) + return false; + + ptr = malloc(PAGE_SIZE); + handle = gem_userptr(kgem->fd, ptr, PAGE_SIZE, false); + gem_close(kgem->fd, handle); + free(ptr); + + return handle != 0; +#else + return false; +#endif +} + +static bool test_has_secure_batches(struct kgem *kgem) +{ + if (DBG_NO_SECURE_BATCHES) + return false; + + return gem_param(kgem, LOCAL_I915_PARAM_HAS_SECURE_BATCHES) > 0; +} + +static int kgem_get_screen_index(struct kgem *kgem) +{ + struct sna *sna = container_of(kgem, struct sna, kgem); + return sna->scrn->scrnIndex; +} + +void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen) +{ + struct drm_i915_gem_get_aperture aperture; + size_t totalram; + unsigned half_gpu_max; + unsigned int i, j; + + DBG(("%s: fd=%d, gen=%d\n", __FUNCTION__, fd, gen)); + + memset(kgem, 0, sizeof(*kgem)); + + kgem->fd = fd; + kgem->gen = gen; + + kgem->has_blt = gem_param(kgem, I915_PARAM_HAS_BLT) > 0; + DBG(("%s: has BLT ring? %d\n", __FUNCTION__, + kgem->has_blt)); + + kgem->has_relaxed_delta = + gem_param(kgem, I915_PARAM_HAS_RELAXED_DELTA) > 0; + DBG(("%s: has relaxed delta? %d\n", __FUNCTION__, + kgem->has_relaxed_delta)); + + kgem->has_relaxed_fencing = test_has_relaxed_fencing(kgem); + DBG(("%s: has relaxed fencing? %d\n", __FUNCTION__, + kgem->has_relaxed_fencing)); + + kgem->has_llc = test_has_llc(kgem); + DBG(("%s: has shared last-level-cache? %d\n", __FUNCTION__, + kgem->has_llc)); + + kgem->has_cacheing = test_has_cacheing(kgem); + DBG(("%s: has set-cache-level? %d\n", __FUNCTION__, + kgem->has_cacheing)); + + kgem->has_userptr = test_has_userptr(kgem); + DBG(("%s: has userptr? 
%d\n", __FUNCTION__, + kgem->has_userptr)); + + kgem->has_semaphores = false; + if (kgem->has_blt && test_has_semaphores_enabled(kgem)) + kgem->has_semaphores = true; + DBG(("%s: semaphores enabled? %d\n", __FUNCTION__, + kgem->has_semaphores)); + + kgem->can_blt_cpu = gen >= 30; + DBG(("%s: can blt to cpu? %d\n", __FUNCTION__, + kgem->can_blt_cpu)); + + kgem->has_secure_batches = test_has_secure_batches(kgem); + DBG(("%s: can use privileged batchbuffers? %d\n", __FUNCTION__, + kgem->has_secure_batches)); + + if (!is_hw_supported(kgem, dev)) { + xf86DrvMsg(kgem_get_screen_index(kgem), X_WARNING, + "Detected unsupported/dysfunctional hardware, disabling acceleration.\n"); + kgem->wedged = 1; + } else if (__kgem_throttle(kgem)) { + xf86DrvMsg(kgem_get_screen_index(kgem), X_WARNING, + "Detected a hung GPU, disabling acceleration.\n"); + kgem->wedged = 1; + } + + kgem->batch_size = ARRAY_SIZE(kgem->batch); + if (gen == 22) + /* 865g cannot handle a batch spanning multiple pages */ + kgem->batch_size = PAGE_SIZE / sizeof(uint32_t); + if (gen >= 70 && gen < 80) + kgem->batch_size = 16*1024; + if (!kgem->has_relaxed_delta && kgem->batch_size > 4*1024) + kgem->batch_size = 4*1024; + + DBG(("%s: maximum batch size? 
%d\n", __FUNCTION__, + kgem->batch_size)); + + kgem->min_alignment = 4; + if (gen < 40) + kgem->min_alignment = 64; + + kgem->half_cpu_cache_pages = cpu_cache_size() >> 13; + DBG(("%s: half cpu cache %d pages\n", __FUNCTION__, + kgem->half_cpu_cache_pages)); + + list_init(&kgem->requests[0]); + list_init(&kgem->requests[1]); + list_init(&kgem->batch_buffers); + list_init(&kgem->active_buffers); + list_init(&kgem->flushing); + list_init(&kgem->large); + list_init(&kgem->large_inactive); + list_init(&kgem->snoop); + for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) + list_init(&kgem->inactive[i]); + for (i = 0; i < ARRAY_SIZE(kgem->active); i++) { + for (j = 0; j < ARRAY_SIZE(kgem->active[i]); j++) + list_init(&kgem->active[i][j]); + } + for (i = 0; i < ARRAY_SIZE(kgem->vma); i++) { + for (j = 0; j < ARRAY_SIZE(kgem->vma[i].inactive); j++) + list_init(&kgem->vma[i].inactive[j]); + } + kgem->vma[MAP_GTT].count = -MAX_GTT_VMA_CACHE; + kgem->vma[MAP_CPU].count = -MAX_CPU_VMA_CACHE; + + kgem->next_request = __kgem_request_alloc(); + + DBG(("%s: cpu bo enabled %d: llc? %d, set-cache-level? %d, userptr? 
%d\n", __FUNCTION__, + !DBG_NO_CPU && (kgem->has_llc | kgem->has_userptr | kgem->has_cacheing), + kgem->has_llc, kgem->has_cacheing, kgem->has_userptr)); + + VG_CLEAR(aperture); + aperture.aper_size = 0; + (void)drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture); + if (aperture.aper_size == 0) + aperture.aper_size = 64*1024*1024; + + kgem->aperture_total = aperture.aper_size; + kgem->aperture_high = aperture.aper_size * 3/4; + kgem->aperture_low = aperture.aper_size * 1/3; + if (gen < 33) { + /* Severe alignment penalties */ + kgem->aperture_high /= 2; + kgem->aperture_low /= 2; + } + DBG(("%s: aperture low=%d [%d], high=%d [%d]\n", __FUNCTION__, + kgem->aperture_low, kgem->aperture_low / (1024*1024), + kgem->aperture_high, kgem->aperture_high / (1024*1024))); + + kgem->aperture_mappable = agp_aperture_size(dev, gen); + if (kgem->aperture_mappable == 0 || + kgem->aperture_mappable > aperture.aper_size) + kgem->aperture_mappable = aperture.aper_size; + DBG(("%s: aperture mappable=%d [%d MiB]\n", __FUNCTION__, + kgem->aperture_mappable, kgem->aperture_mappable / (1024*1024))); + + kgem->buffer_size = 64 * 1024; + while (kgem->buffer_size < kgem->aperture_mappable >> 10) + kgem->buffer_size *= 2; + DBG(("%s: buffer size=%d [%d KiB]\n", __FUNCTION__, + kgem->buffer_size, kgem->buffer_size / 1024)); + + kgem->max_object_size = 2 * aperture.aper_size / 3; + kgem->max_gpu_size = kgem->max_object_size; + if (!kgem->has_llc) + kgem->max_gpu_size = MAX_CACHE_SIZE; + if (gen < 40) { + /* If we have to use fences for blitting, we have to make + * sure we can fit them into the aperture. 
+ */ + kgem->max_gpu_size = kgem->aperture_mappable / 2; + if (kgem->max_gpu_size > kgem->aperture_low) + kgem->max_gpu_size = kgem->aperture_low; + } + + totalram = total_ram_size(); + if (totalram == 0) { + DBG(("%s: total ram size unknown, assuming maximum of total aperture\n", + __FUNCTION__)); + totalram = kgem->aperture_total; + } + DBG(("%s: total ram=%ld\n", __FUNCTION__, (long)totalram)); + if (kgem->max_object_size > totalram / 2) + kgem->max_object_size = totalram / 2; + if (kgem->max_gpu_size > totalram / 4) + kgem->max_gpu_size = totalram / 4; + + half_gpu_max = kgem->max_gpu_size / 2; + if (kgem->gen >= 40) + kgem->max_cpu_size = half_gpu_max; + else + kgem->max_cpu_size = kgem->max_object_size; + + kgem->max_copy_tile_size = (MAX_CACHE_SIZE + 1)/2; + if (kgem->max_copy_tile_size > half_gpu_max) + kgem->max_copy_tile_size = half_gpu_max; + + if (kgem->has_llc) + kgem->max_upload_tile_size = kgem->max_copy_tile_size; + else + kgem->max_upload_tile_size = kgem->aperture_mappable / 4; + if (kgem->max_upload_tile_size > half_gpu_max) + kgem->max_upload_tile_size = half_gpu_max; + + kgem->large_object_size = MAX_CACHE_SIZE; + if (kgem->large_object_size > kgem->max_gpu_size) + kgem->large_object_size = kgem->max_gpu_size; + + if (kgem->has_llc | kgem->has_cacheing | kgem->has_userptr) { + if (kgem->large_object_size > kgem->max_cpu_size) + kgem->large_object_size = kgem->max_cpu_size; + } else + kgem->max_cpu_size = 0; + if (DBG_NO_CPU) + kgem->max_cpu_size = 0; + + DBG(("%s: maximum object size=%d\n", + __FUNCTION__, kgem->max_object_size)); + DBG(("%s: large object thresold=%d\n", + __FUNCTION__, kgem->large_object_size)); + DBG(("%s: max object sizes (gpu=%d, cpu=%d, tile upload=%d, copy=%d)\n", + __FUNCTION__, + kgem->max_gpu_size, kgem->max_cpu_size, + kgem->max_upload_tile_size, kgem->max_copy_tile_size)); + + /* Convert the aperture thresholds to pages */ + kgem->aperture_low /= PAGE_SIZE; + kgem->aperture_high /= PAGE_SIZE; + + kgem->fence_max = 
gem_param(kgem, I915_PARAM_NUM_FENCES_AVAIL) - 2; + if ((int)kgem->fence_max < 0) + kgem->fence_max = 5; /* minimum safe value for all hw */ + DBG(("%s: max fences=%d\n", __FUNCTION__, kgem->fence_max)); +} + +/* XXX hopefully a good approximation */ +static uint32_t kgem_get_unique_id(struct kgem *kgem) +{ + uint32_t id; + id = ++kgem->unique_id; + if (id == 0) + id = ++kgem->unique_id; + return id; +} + +inline static uint32_t kgem_pitch_alignment(struct kgem *kgem, unsigned flags) +{ + if (flags & CREATE_PRIME) + return 256; + if (flags & CREATE_SCANOUT) + return 64; + return kgem->min_alignment; +} + +static uint32_t kgem_untiled_pitch(struct kgem *kgem, + uint32_t width, uint32_t bpp, + unsigned flags) +{ + width = ALIGN(width, 2) * bpp >> 3; + return ALIGN(width, kgem_pitch_alignment(kgem, flags)); +} + +void kgem_get_tile_size(struct kgem *kgem, int tiling, + int *tile_width, int *tile_height, int *tile_size) +{ + if (kgem->gen <= 30) { + if (tiling) { + if (kgem->gen < 30) { + *tile_width = 128; + *tile_height = 16; + *tile_size = 2048; + } else { + *tile_width = 512; + *tile_height = 8; + *tile_size = 4096; + } + } else { + *tile_width = 1; + *tile_height = 1; + *tile_size = 1; + } + } else switch (tiling) { + default: + case I915_TILING_NONE: + *tile_width = 1; + *tile_height = 1; + *tile_size = 1; + break; + case I915_TILING_X: + *tile_width = 512; + *tile_height = 8; + *tile_size = 4096; + break; + case I915_TILING_Y: + *tile_width = 128; + *tile_height = 32; + *tile_size = 4096; + break; + } +} + +static uint32_t kgem_surface_size(struct kgem *kgem, + bool relaxed_fencing, + unsigned flags, + uint32_t width, + uint32_t height, + uint32_t bpp, + uint32_t tiling, + uint32_t *pitch) +{ + uint32_t tile_width, tile_height; + uint32_t size; + + assert(width <= MAXSHORT); + assert(height <= MAXSHORT); + + if (kgem->gen <= 30) { + if (tiling) { + if (kgem->gen < 30) { + tile_width = 128; + tile_height = 16; + } else { + tile_width = 512; + tile_height = 8; + } 
+ } else { + tile_width = 2 * bpp >> 3; + tile_width = ALIGN(tile_width, + kgem_pitch_alignment(kgem, flags)); + tile_height = 2; + } + } else switch (tiling) { + default: + case I915_TILING_NONE: + tile_width = 2 * bpp >> 3; + tile_width = ALIGN(tile_width, + kgem_pitch_alignment(kgem, flags)); + tile_height = 2; + break; + case I915_TILING_X: + tile_width = 512; + tile_height = 8; + break; + case I915_TILING_Y: + tile_width = 128; + tile_height = 32; + break; + } + + *pitch = ALIGN(width * bpp / 8, tile_width); + height = ALIGN(height, tile_height); + if (kgem->gen >= 40) + return PAGE_ALIGN(*pitch * height); + + /* If it is too wide for the blitter, don't even bother. */ + if (tiling != I915_TILING_NONE) { + if (*pitch > 8192) + return 0; + + for (size = tile_width; size < *pitch; size <<= 1) + ; + *pitch = size; + } else { + if (*pitch >= 32768) + return 0; + } + + size = *pitch * height; + if (relaxed_fencing || tiling == I915_TILING_NONE) + return PAGE_ALIGN(size); + + /* We need to allocate a pot fence region for a tiled buffer. */ + if (kgem->gen < 30) + tile_width = 512 * 1024; + else + tile_width = 1024 * 1024; + while (tile_width < size) + tile_width *= 2; + return tile_width; +} + +static uint32_t kgem_aligned_height(struct kgem *kgem, + uint32_t height, uint32_t tiling) +{ + uint32_t tile_height; + + if (kgem->gen <= 30) { + tile_height = tiling ? kgem->gen < 30 ? 
16 : 8 : 1; + } else switch (tiling) { + default: + case I915_TILING_NONE: + tile_height = 2; + break; + case I915_TILING_X: + tile_height = 8; + break; + case I915_TILING_Y: + tile_height = 32; + break; + } + + return ALIGN(height, tile_height); +} + +static struct drm_i915_gem_exec_object2 * +kgem_add_handle(struct kgem *kgem, struct kgem_bo *bo) +{ + struct drm_i915_gem_exec_object2 *exec; + + DBG(("%s: handle=%d, index=%d\n", + __FUNCTION__, bo->handle, kgem->nexec)); + + assert(kgem->nexec < ARRAY_SIZE(kgem->exec)); + exec = memset(&kgem->exec[kgem->nexec++], 0, sizeof(*exec)); + exec->handle = bo->handle; + exec->offset = bo->presumed_offset; + + kgem->aperture += num_pages(bo); + + return exec; +} + +void _kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo) +{ + bo->exec = kgem_add_handle(kgem, bo); + bo->rq = kgem->next_request; + + list_move_tail(&bo->request, &kgem->next_request->buffers); + + /* XXX is it worth working around gcc here? */ + kgem->flush |= bo->flush; +} + +static uint32_t kgem_end_batch(struct kgem *kgem) +{ + kgem->batch[kgem->nbatch++] = MI_BATCH_BUFFER_END; + if (kgem->nbatch & 1) + kgem->batch[kgem->nbatch++] = MI_NOOP; + + return kgem->nbatch; +} + +static void kgem_fixup_self_relocs(struct kgem *kgem, struct kgem_bo *bo) +{ + int n; + + for (n = 0; n < kgem->nreloc; n++) { + if (kgem->reloc[n].target_handle == 0) { + kgem->reloc[n].target_handle = bo->handle; + kgem->reloc[n].presumed_offset = bo->presumed_offset; + kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] = + kgem->reloc[n].delta + bo->presumed_offset; + } + } +} + +static void kgem_bo_binding_free(struct kgem *kgem, struct kgem_bo *bo) +{ + struct kgem_bo_binding *b; + + b = bo->binding.next; + while (b) { + struct kgem_bo_binding *next = b->next; + free (b); + b = next; + } +} + +static void kgem_bo_release_map(struct kgem *kgem, struct kgem_bo *bo) +{ + int type = IS_CPU_MAP(bo->map); + + assert(!IS_USER_MAP(bo->map)); + + DBG(("%s: releasing %s vma for 
handle=%d, count=%d\n", + __FUNCTION__, type ? "CPU" : "GTT", + bo->handle, kgem->vma[type].count)); + + VG(if (type) VALGRIND_MAKE_MEM_NOACCESS(MAP(bo->map), bytes(bo))); + munmap(MAP(bo->map), bytes(bo)); + bo->map = NULL; + + if (!list_is_empty(&bo->vma)) { + list_del(&bo->vma); + kgem->vma[type].count--; + } +} + +static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo) +{ + DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); + assert(bo->refcnt == 0); + assert(bo->exec == NULL); + assert(!bo->snoop || bo->rq == NULL); + +#ifdef DEBUG_MEMORY + kgem->debug_memory.bo_allocs--; + kgem->debug_memory.bo_bytes -= bytes(bo); +#endif + + kgem_bo_binding_free(kgem, bo); + + if (IS_USER_MAP(bo->map)) { + assert(bo->rq == NULL); + assert(MAP(bo->map) != bo || bo->io); + if (bo != MAP(bo->map)) { + DBG(("%s: freeing snooped base\n", __FUNCTION__)); + free(MAP(bo->map)); + } + bo->map = NULL; + } + if (bo->map) + kgem_bo_release_map(kgem, bo); + assert(list_is_empty(&bo->vma)); + + _list_del(&bo->list); + _list_del(&bo->request); + gem_close(kgem->fd, bo->handle); + + if (!bo->io) { + *(struct kgem_bo **)bo = __kgem_freed_bo; + __kgem_freed_bo = bo; + } else + free(bo); +} + +inline static void kgem_bo_move_to_inactive(struct kgem *kgem, + struct kgem_bo *bo) +{ + DBG(("%s: moving handle=%d to inactive\n", __FUNCTION__, bo->handle)); + + assert(bo->refcnt == 0); + assert(bo->reusable); + assert(bo->rq == NULL); + assert(bo->exec == NULL); + assert(bo->domain != DOMAIN_GPU); + assert(!kgem_busy(kgem, bo->handle)); + assert(!bo->proxy); + assert(!bo->io); + assert(!bo->needs_flush); + assert(list_is_empty(&bo->vma)); + + kgem->need_expire = true; + + if (bucket(bo) >= NUM_CACHE_BUCKETS) { + list_move(&bo->list, &kgem->large_inactive); + return; + } + + assert(bo->flush == false); + list_move(&bo->list, &kgem->inactive[bucket(bo)]); + if (bo->map) { + int type = IS_CPU_MAP(bo->map); + if (bucket(bo) >= NUM_CACHE_BUCKETS || + (!type && !kgem_bo_is_mappable(kgem, bo))) { 
+ munmap(MAP(bo->map), bytes(bo)); + bo->map = NULL; + } + if (bo->map) { + list_add(&bo->vma, &kgem->vma[type].inactive[bucket(bo)]); + kgem->vma[type].count++; + } + } +} + +inline static void kgem_bo_remove_from_inactive(struct kgem *kgem, + struct kgem_bo *bo) +{ + DBG(("%s: removing handle=%d from inactive\n", __FUNCTION__, bo->handle)); + + list_del(&bo->list); + assert(bo->rq == NULL); + assert(bo->exec == NULL); + if (bo->map) { + assert(!list_is_empty(&bo->vma)); + list_del(&bo->vma); + kgem->vma[IS_CPU_MAP(bo->map)].count--; + } +} + +inline static void kgem_bo_remove_from_active(struct kgem *kgem, + struct kgem_bo *bo) +{ + DBG(("%s: removing handle=%d from active\n", __FUNCTION__, bo->handle)); + + list_del(&bo->list); + assert(bo->rq != NULL); + if (bo->rq == &_kgem_static_request) + list_del(&bo->request); + assert(list_is_empty(&bo->vma)); +} + +static void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo) +{ + if (!bo->scanout) + return; + + assert(bo->proxy == NULL); + + DBG(("%s: handle=%d, fb=%d (reusable=%d)\n", + __FUNCTION__, bo->handle, bo->delta, bo->reusable)); + if (bo->delta) { + /* XXX will leak if we are not DRM_MASTER. 
*shrug* */ + drmModeRmFB(kgem->fd, bo->delta); + bo->delta = 0; + } + + bo->scanout = false; + bo->needs_flush = true; + bo->flush = false; + bo->reusable = true; + + if (kgem->has_llc && + !gem_set_cacheing(kgem->fd, bo->handle, SNOOPED)) + bo->reusable = false; +} + +static void _kgem_bo_delete_buffer(struct kgem *kgem, struct kgem_bo *bo) +{ + struct kgem_buffer *io = (struct kgem_buffer *)bo->proxy; + + DBG(("%s: size=%d, offset=%d, parent used=%d\n", + __FUNCTION__, bo->size.bytes, bo->delta, io->used)); + + if (ALIGN(bo->delta + bo->size.bytes, UPLOAD_ALIGNMENT) == io->used) + io->used = bo->delta; +} + +static void kgem_bo_move_to_snoop(struct kgem *kgem, struct kgem_bo *bo) +{ + assert(bo->refcnt == 0); + assert(bo->exec == NULL); + + if (num_pages(bo) > kgem->max_cpu_size >> 13) { + DBG(("%s handle=%d discarding large CPU buffer (%d >%d pages)\n", + __FUNCTION__, bo->handle, num_pages(bo), kgem->max_cpu_size >> 13)); + kgem_bo_free(kgem, bo); + return; + } + + assert(bo->tiling == I915_TILING_NONE); + assert(bo->rq == NULL); + + DBG(("%s: moving %d to snoop cachee\n", __FUNCTION__, bo->handle)); + list_add(&bo->list, &kgem->snoop); +} + +static struct kgem_bo * +search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags) +{ + struct kgem_bo *bo, *first = NULL; + + DBG(("%s: num_pages=%d, flags=%x\n", __FUNCTION__, num_pages, flags)); + + if ((kgem->has_cacheing | kgem->has_userptr) == 0) + return NULL; + + if (list_is_empty(&kgem->snoop)) { + DBG(("%s: inactive and cache empty\n", __FUNCTION__)); + if (!__kgem_throttle_retire(kgem, flags)) { + DBG(("%s: nothing retired\n", __FUNCTION__)); + return NULL; + } + } + + list_for_each_entry(bo, &kgem->snoop, list) { + assert(bo->refcnt == 0); + assert(bo->snoop); + assert(bo->proxy == NULL); + assert(bo->tiling == I915_TILING_NONE); + assert(bo->rq == NULL); + assert(bo->exec == NULL); + + if (num_pages > num_pages(bo)) + continue; + + if (num_pages(bo) > 2*num_pages) { + if (first == NULL) + 
first = bo; + continue; + } + + list_del(&bo->list); + bo->pitch = 0; + bo->delta = 0; + + DBG((" %s: found handle=%d (num_pages=%d) in snoop cache\n", + __FUNCTION__, bo->handle, num_pages(bo))); + return bo; + } + + if (first) { + list_del(&first->list); + first->pitch = 0; + first->delta = 0; + + DBG((" %s: found handle=%d (num_pages=%d) in snoop cache\n", + __FUNCTION__, first->handle, num_pages(first))); + return first; + } + + return NULL; +} + +static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) +{ + DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); + + assert(list_is_empty(&bo->list)); + assert(bo->refcnt == 0); + assert(!bo->purged); + assert(bo->proxy == NULL); + + bo->binding.offset = 0; + kgem_bo_clear_scanout(kgem, bo); + + if (DBG_NO_CACHE) + goto destroy; + + if (bo->snoop && !bo->flush) { + DBG(("%s: handle=%d is snooped\n", __FUNCTION__, bo->handle)); + assert(!bo->flush); + assert(list_is_empty(&bo->list)); + if (bo->rq == NULL) { + if (bo->needs_flush && kgem_busy(kgem, bo->handle)) { + DBG(("%s: handle=%d is snooped, tracking until free\n", + __FUNCTION__, bo->handle)); + list_add(&bo->request, &kgem->flushing); + bo->rq = &_kgem_static_request; + } + } + if (bo->rq == NULL) + kgem_bo_move_to_snoop(kgem, bo); + return; + } + + if (bo->io) { + struct kgem_bo *base; + + assert(!bo->snoop); + base = malloc(sizeof(*base)); + if (base) { + DBG(("%s: transferring io handle=%d to bo\n", + __FUNCTION__, bo->handle)); + /* transfer the handle to a minimum bo */ + memcpy(base, bo, sizeof(*base)); + base->io = false; + list_init(&base->list); + list_replace(&bo->request, &base->request); + list_replace(&bo->vma, &base->vma); + free(bo); + bo = base; + } else + bo->reusable = false; + } + + if (!bo->reusable) { + DBG(("%s: handle=%d, not reusable\n", + __FUNCTION__, bo->handle)); + goto destroy; + } + + if (!kgem->has_llc && IS_CPU_MAP(bo->map) && bo->domain != DOMAIN_CPU) + kgem_bo_release_map(kgem, bo); + + 
assert(list_is_empty(&bo->vma)); + assert(list_is_empty(&bo->list)); + assert(bo->snoop == false); + assert(bo->io == false); + assert(bo->scanout == false); + + if (bo->rq) { + struct list *cache; + + DBG(("%s: handle=%d -> active\n", __FUNCTION__, bo->handle)); + if (bucket(bo) < NUM_CACHE_BUCKETS) + cache = &kgem->active[bucket(bo)][bo->tiling]; + else + cache = &kgem->large; + list_add(&bo->list, cache); + return; + } + + assert(bo->exec == NULL); + assert(list_is_empty(&bo->request)); + + if (bo->needs_flush) { + if ((bo->needs_flush = kgem_busy(kgem, bo->handle))) { + struct list *cache; + + DBG(("%s: handle=%d -> flushing\n", + __FUNCTION__, bo->handle)); + + list_add(&bo->request, &kgem->flushing); + if (bucket(bo) < NUM_CACHE_BUCKETS) + cache = &kgem->active[bucket(bo)][bo->tiling]; + else + cache = &kgem->large; + list_add(&bo->list, cache); + bo->rq = &_kgem_static_request; + return; + } + + bo->domain = DOMAIN_NONE; + } + + if (!IS_CPU_MAP(bo->map)) { + if (!kgem_bo_set_purgeable(kgem, bo)) + goto destroy; + + if (!kgem->has_llc && bo->domain == DOMAIN_CPU) + goto destroy; + + DBG(("%s: handle=%d, purged\n", + __FUNCTION__, bo->handle)); + } + + kgem_bo_move_to_inactive(kgem, bo); + return; + +destroy: + if (!bo->exec) + kgem_bo_free(kgem, bo); +} + +static void kgem_bo_unref(struct kgem *kgem, struct kgem_bo *bo) +{ + assert(bo->refcnt); + if (--bo->refcnt == 0) + __kgem_bo_destroy(kgem, bo); +} + +static void kgem_buffer_release(struct kgem *kgem, struct kgem_buffer *bo) +{ + while (!list_is_empty(&bo->base.vma)) { + struct kgem_bo *cached; + + cached = list_first_entry(&bo->base.vma, struct kgem_bo, vma); + assert(cached->proxy == &bo->base); + list_del(&cached->vma); + + assert(*(struct kgem_bo **)cached->map == cached); + *(struct kgem_bo **)cached->map = NULL; + cached->map = NULL; + + kgem_bo_destroy(kgem, cached); + } +} + +static bool kgem_retire__buffers(struct kgem *kgem) +{ + bool retired = false; + + while 
(!list_is_empty(&kgem->active_buffers)) { + struct kgem_buffer *bo = + list_last_entry(&kgem->active_buffers, + struct kgem_buffer, + base.list); + + if (bo->base.rq) + break; + + DBG(("%s: releasing upload cache for handle=%d? %d\n", + __FUNCTION__, bo->base.handle, !list_is_empty(&bo->base.vma))); + list_del(&bo->base.list); + kgem_buffer_release(kgem, bo); + kgem_bo_unref(kgem, &bo->base); + retired = true; + } + + return retired; +} + +static bool kgem_retire__flushing(struct kgem *kgem) +{ + struct kgem_bo *bo, *next; + bool retired = false; + + list_for_each_entry_safe(bo, next, &kgem->flushing, request) { + assert(bo->rq == &_kgem_static_request); + assert(bo->exec == NULL); + + if (kgem_busy(kgem, bo->handle)) + break; + + bo->needs_flush = false; + bo->domain = DOMAIN_NONE; + bo->rq = NULL; + list_del(&bo->request); + + if (!bo->refcnt) { + if (bo->snoop) { + kgem_bo_move_to_snoop(kgem, bo); + } else if (kgem_bo_set_purgeable(kgem, bo)) { + assert(bo->reusable); + kgem_bo_move_to_inactive(kgem, bo); + retired = true; + } else + kgem_bo_free(kgem, bo); + } + } +#if HAS_DEBUG_FULL + { + int count = 0; + list_for_each_entry(bo, &kgem->flushing, request) + count++; + ErrorF("%s: %d bo on flushing list\n", __FUNCTION__, count); + } +#endif + + return retired; +} + +static bool kgem_retire__requests(struct kgem *kgem) +{ + struct kgem_bo *bo; + bool retired = false; + int n; + + for (n = 0; n < ARRAY_SIZE(kgem->requests); n++) { + while (!list_is_empty(&kgem->requests[n])) { + struct kgem_request *rq; + + rq = list_first_entry(&kgem->requests[n], + struct kgem_request, + list); + if (kgem_busy(kgem, rq->bo->handle)) + break; + + DBG(("%s: request %d complete\n", + __FUNCTION__, rq->bo->handle)); + + while (!list_is_empty(&rq->buffers)) { + bo = list_first_entry(&rq->buffers, + struct kgem_bo, + request); + + assert(bo->rq == rq); + assert(bo->exec == NULL); + assert(bo->domain == DOMAIN_GPU); + + list_del(&bo->request); + + if (bo->needs_flush) + bo->needs_flush 
= kgem_busy(kgem, bo->handle); + if (bo->needs_flush) { + DBG(("%s: moving %d to flushing\n", + __FUNCTION__, bo->handle)); + list_add(&bo->request, &kgem->flushing); + bo->rq = &_kgem_static_request; + } else { + bo->domain = DOMAIN_NONE; + bo->rq = NULL; + } + + if (bo->refcnt) + continue; + + if (bo->snoop) { + if (bo->needs_flush) { + list_add(&bo->request, &kgem->flushing); + bo->rq = &_kgem_static_request; + } else { + kgem_bo_move_to_snoop(kgem, bo); + } + continue; + } + + if (!bo->reusable) { + DBG(("%s: closing %d\n", + __FUNCTION__, bo->handle)); + kgem_bo_free(kgem, bo); + continue; + } + + if (!bo->needs_flush) { + if (kgem_bo_set_purgeable(kgem, bo)) { + kgem_bo_move_to_inactive(kgem, bo); + retired = true; + } else { + DBG(("%s: closing %d\n", + __FUNCTION__, bo->handle)); + kgem_bo_free(kgem, bo); + } + } + } + + assert(rq->bo->rq == NULL); + assert(list_is_empty(&rq->bo->request)); + + if (--rq->bo->refcnt == 0) { + if (kgem_bo_set_purgeable(kgem, rq->bo)) { + kgem_bo_move_to_inactive(kgem, rq->bo); + retired = true; + } else { + DBG(("%s: closing %d\n", + __FUNCTION__, rq->bo->handle)); + kgem_bo_free(kgem, rq->bo); + } + } + + __kgem_request_free(rq); + kgem->num_requests--; + } + +#if HAS_DEBUG_FULL + { + int count = 0; + + list_for_each_entry(bo, &kgem->requests[n], request) + count++; + + bo = NULL; + if (!list_is_empty(&kgem->requests[n])) + bo = list_first_entry(&kgem->requests[n], + struct kgem_request, + list)->bo; + + ErrorF("%s: ring=%d, %d outstanding requests, oldest=%d\n", + __FUNCTION__, n, count, bo ? 
bo->handle : 0); + } +#endif + } + +#if HAS_DEBUG_FULL + { + int count = 0; + + for (n = 0; n < ARRAY_SIZE(kgem->requests); n++) + list_for_each_entry(bo, &kgem->requests[n], request) + count++; + + assert(count == kgem->num_requests); + } +#endif + + return retired; +} + +bool kgem_retire(struct kgem *kgem) +{ + bool retired = false; + + DBG(("%s\n", __FUNCTION__)); + + retired |= kgem_retire__flushing(kgem); + if (kgem->num_requests) + retired |= kgem_retire__requests(kgem); + retired |= kgem_retire__buffers(kgem); + + kgem->need_retire = + kgem->num_requests || + !list_is_empty(&kgem->flushing); + DBG(("%s -- retired=%d, need_retire=%d\n", + __FUNCTION__, retired, kgem->need_retire)); + + kgem->retire(kgem); + + return retired; +} + +bool __kgem_is_idle(struct kgem *kgem) +{ + int n; + + assert(kgem->num_requests); + + for (n = 0; n < ARRAY_SIZE(kgem->requests); n++) { + struct kgem_request *rq; + + if (list_is_empty(&kgem->requests[n])) + continue; + + rq = list_last_entry(&kgem->requests[n], + struct kgem_request, list); + if (kgem_busy(kgem, rq->bo->handle)) { + DBG(("%s: last requests handle=%d still busy\n", + __FUNCTION__, rq->bo->handle)); + return false; + } + + DBG(("%s: ring=%d idle (handle=%d)\n", + __FUNCTION__, n, rq->bo->handle)); + } + kgem_retire__requests(kgem); + assert(kgem->num_requests == 0); + return true; +} + +static void kgem_commit(struct kgem *kgem) +{ + struct kgem_request *rq = kgem->next_request; + struct kgem_bo *bo, *next; + + list_for_each_entry_safe(bo, next, &rq->buffers, request) { + assert(next->request.prev == &bo->request); + + DBG(("%s: release handle=%d (proxy? %d), dirty? %d flush? %d, snoop? 
%d -> offset=%x\n", + __FUNCTION__, bo->handle, bo->proxy != NULL, + bo->dirty, bo->needs_flush, bo->snoop, + (unsigned)bo->exec->offset)); + + assert(!bo->purged); + assert(bo->exec); + assert(bo->proxy == NULL || bo->exec == &_kgem_dummy_exec); + assert(bo->rq == rq || (bo->proxy->rq == rq)); + + bo->presumed_offset = bo->exec->offset; + bo->exec = NULL; + + if (!bo->refcnt && !bo->reusable) { + assert(!bo->snoop); + kgem_bo_free(kgem, bo); + continue; + } + + bo->binding.offset = 0; + bo->domain = DOMAIN_GPU; + bo->dirty = false; + + if (bo->proxy) { + /* proxies are not used for domain tracking */ + list_del(&bo->request); + bo->rq = NULL; + bo->exec = NULL; + } + } + + if (rq == &_kgem_static_request) { + struct drm_i915_gem_set_domain set_domain; + + DBG(("%s: syncing due to allocation failure\n", __FUNCTION__)); + + VG_CLEAR(set_domain); + set_domain.handle = rq->bo->handle; + set_domain.read_domains = I915_GEM_DOMAIN_GTT; + set_domain.write_domain = I915_GEM_DOMAIN_GTT; + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain)) { + DBG(("%s: sync: GPU hang detected\n", __FUNCTION__)); + kgem_throttle(kgem); + } + + kgem_retire(kgem); + assert(list_is_empty(&rq->buffers)); + + gem_close(kgem->fd, rq->bo->handle); + } else { + list_add_tail(&rq->list, &kgem->requests[rq->ring]); + kgem->need_throttle = kgem->need_retire = 1; + kgem->num_requests++; + } + + kgem->next_request = NULL; +} + +static void kgem_close_list(struct kgem *kgem, struct list *head) +{ + while (!list_is_empty(head)) + kgem_bo_free(kgem, list_first_entry(head, struct kgem_bo, list)); +} + +static void kgem_close_inactive(struct kgem *kgem) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) + kgem_close_list(kgem, &kgem->inactive[i]); +} + +static void kgem_finish_buffers(struct kgem *kgem) +{ + struct kgem_buffer *bo, *next; + + list_for_each_entry_safe(bo, next, &kgem->batch_buffers, base.list) { + DBG(("%s: buffer handle=%d, used=%d, exec?=%d, write=%d, 
mmapped=%d\n", + __FUNCTION__, bo->base.handle, bo->used, bo->base.exec!=NULL, + bo->write, bo->mmapped)); + + assert(next->base.list.prev == &bo->base.list); + assert(bo->base.io); + assert(bo->base.refcnt >= 1); + + if (!bo->base.exec) { + DBG(("%s: skipping unattached handle=%d, used=%d\n", + __FUNCTION__, bo->base.handle, bo->used)); + continue; + } + + if (!bo->write) { + assert(bo->base.exec || bo->base.refcnt > 1); + goto decouple; + } + + if (bo->mmapped) { + int used; + + assert(!bo->need_io); + + used = ALIGN(bo->used + PAGE_SIZE-1, PAGE_SIZE); + if (!DBG_NO_UPLOAD_ACTIVE && + used + PAGE_SIZE <= bytes(&bo->base) && + (kgem->has_llc || !IS_CPU_MAP(bo->base.map))) { + DBG(("%s: retaining upload buffer (%d/%d)\n", + __FUNCTION__, bo->used, bytes(&bo->base))); + assert(!bo->base.snoop); + bo->used = used; + list_move(&bo->base.list, + &kgem->active_buffers); + continue; + } + DBG(("%s: discarding mmapped buffer, used=%d, map type=%d\n", + __FUNCTION__, bo->used, (int)__MAP_TYPE(bo->base.map))); + goto decouple; + } + + if (!bo->used) { + /* Unless we replace the handle in the execbuffer, + * then this bo will become active. So decouple it + * from the buffer list and track it in the normal + * manner. 
+ */ + goto decouple; + } + + assert(bo->need_io); + assert(bo->base.rq == kgem->next_request); + assert(bo->base.domain != DOMAIN_GPU); + + if (bo->base.refcnt == 1 && + bo->base.size.pages.count > 1 && + bo->used < bytes(&bo->base) / 2) { + struct kgem_bo *shrink; + + shrink = search_linear_cache(kgem, + PAGE_ALIGN(bo->used), + CREATE_INACTIVE | CREATE_NO_RETIRE); + if (shrink) { + int n; + + DBG(("%s: used=%d, shrinking %d to %d, handle %d to %d\n", + __FUNCTION__, + bo->used, bytes(&bo->base), bytes(shrink), + bo->base.handle, shrink->handle)); + + assert(bo->used <= bytes(shrink)); + gem_write(kgem->fd, shrink->handle, + 0, bo->used, bo->mem); + + for (n = 0; n < kgem->nreloc; n++) { + if (kgem->reloc[n].target_handle == bo->base.handle) { + kgem->reloc[n].target_handle = shrink->handle; + kgem->reloc[n].presumed_offset = shrink->presumed_offset; + kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] = + kgem->reloc[n].delta + shrink->presumed_offset; + } + } + + bo->base.exec->handle = shrink->handle; + bo->base.exec->offset = shrink->presumed_offset; + shrink->exec = bo->base.exec; + shrink->rq = bo->base.rq; + list_replace(&bo->base.request, + &shrink->request); + list_init(&bo->base.request); + shrink->needs_flush = bo->base.dirty; + + bo->base.exec = NULL; + bo->base.rq = NULL; + bo->base.dirty = false; + bo->base.needs_flush = false; + bo->used = 0; + + goto decouple; + } + } + + DBG(("%s: handle=%d, uploading %d/%d\n", + __FUNCTION__, bo->base.handle, bo->used, bytes(&bo->base))); + assert(!kgem_busy(kgem, bo->base.handle)); + assert(bo->used <= bytes(&bo->base)); + gem_write(kgem->fd, bo->base.handle, + 0, bo->used, bo->mem); + bo->need_io = 0; + +decouple: + DBG(("%s: releasing handle=%d\n", + __FUNCTION__, bo->base.handle)); + list_del(&bo->base.list); + kgem_bo_unref(kgem, &bo->base); + } +} + +static void kgem_cleanup(struct kgem *kgem) +{ + int n; + + for (n = 0; n < ARRAY_SIZE(kgem->requests); n++) { + while 
(!list_is_empty(&kgem->requests[n])) { + struct kgem_request *rq; + + rq = list_first_entry(&kgem->requests[n], + struct kgem_request, + list); + while (!list_is_empty(&rq->buffers)) { + struct kgem_bo *bo; + + bo = list_first_entry(&rq->buffers, + struct kgem_bo, + request); + + list_del(&bo->request); + bo->rq = NULL; + bo->exec = NULL; + bo->domain = DOMAIN_NONE; + bo->dirty = false; + if (bo->refcnt == 0) + kgem_bo_free(kgem, bo); + } + + __kgem_request_free(rq); + } + } + + kgem->num_requests = 0; + kgem_close_inactive(kgem); +} + +static int kgem_batch_write(struct kgem *kgem, uint32_t handle, uint32_t size) +{ + int ret; + + assert(!kgem_busy(kgem, handle)); + + /* If there is no surface data, just upload the batch */ + if (kgem->surface == kgem->batch_size) + return gem_write(kgem->fd, handle, + 0, sizeof(uint32_t)*kgem->nbatch, + kgem->batch); + + /* Are the batch pages conjoint with the surface pages? */ + if (kgem->surface < kgem->nbatch + PAGE_SIZE/sizeof(uint32_t)) { + assert(size == PAGE_ALIGN(kgem->batch_size*sizeof(uint32_t))); + return gem_write(kgem->fd, handle, + 0, kgem->batch_size*sizeof(uint32_t), + kgem->batch); + } + + /* Disjoint surface/batch, upload separately */ + ret = gem_write(kgem->fd, handle, + 0, sizeof(uint32_t)*kgem->nbatch, + kgem->batch); + if (ret) + return ret; + + ret = PAGE_ALIGN(sizeof(uint32_t) * kgem->batch_size); + ret -= sizeof(uint32_t) * kgem->surface; + assert(size-ret >= kgem->nbatch*sizeof(uint32_t)); + return __gem_write(kgem->fd, handle, + size - ret, (kgem->batch_size - kgem->surface)*sizeof(uint32_t), + kgem->batch + kgem->surface); +} + +void kgem_reset(struct kgem *kgem) +{ + if (kgem->next_request) { + struct kgem_request *rq = kgem->next_request; + + while (!list_is_empty(&rq->buffers)) { + struct kgem_bo *bo = + list_first_entry(&rq->buffers, + struct kgem_bo, + request); + list_del(&bo->request); + + bo->binding.offset = 0; + bo->exec = NULL; + bo->dirty = false; + bo->rq = NULL; + bo->domain = 
DOMAIN_NONE; + + if (!bo->refcnt) { + DBG(("%s: discarding handle=%d\n", + __FUNCTION__, bo->handle)); + kgem_bo_free(kgem, bo); + } + } + + if (kgem->next_request != &_kgem_static_request) + free(kgem->next_request); + } + + kgem->nfence = 0; + kgem->nexec = 0; + kgem->nreloc = 0; + kgem->aperture = 0; + kgem->aperture_fenced = 0; + kgem->nbatch = 0; + kgem->surface = kgem->batch_size; + kgem->mode = KGEM_NONE; + kgem->batch_flags = 0; + kgem->flush = 0; + + kgem->next_request = __kgem_request_alloc(); + + kgem_sna_reset(kgem); +} + +static int compact_batch_surface(struct kgem *kgem) +{ + int size, shrink, n; + + if (!kgem->has_relaxed_delta) + return kgem->batch_size; + + /* See if we can pack the contents into one or two pages */ + n = ALIGN(kgem->batch_size, 1024); + size = n - kgem->surface + kgem->nbatch; + size = ALIGN(size, 1024); + + shrink = n - size; + if (shrink) { + DBG(("shrinking from %d to %d\n", kgem->batch_size, size)); + + shrink *= sizeof(uint32_t); + for (n = 0; n < kgem->nreloc; n++) { + if (kgem->reloc[n].read_domains == I915_GEM_DOMAIN_INSTRUCTION && + kgem->reloc[n].target_handle == 0) + kgem->reloc[n].delta -= shrink; + + if (kgem->reloc[n].offset >= sizeof(uint32_t)*kgem->nbatch) + kgem->reloc[n].offset -= shrink; + } + } + + return size * sizeof(uint32_t); +} + +void _kgem_submit(struct kgem *kgem) +{ + struct kgem_request *rq; + uint32_t batch_end; + int size; + + assert(!DBG_NO_HW); + assert(!kgem->wedged); + + assert(kgem->nbatch); + assert(kgem->nbatch <= KGEM_BATCH_SIZE(kgem)); + assert(kgem->nbatch <= kgem->surface); + + batch_end = kgem_end_batch(kgem); + kgem_sna_flush(kgem); + + DBG(("batch[%d/%d]: %d %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d\n", + kgem->mode, kgem->ring, batch_end, kgem->nbatch, kgem->surface, kgem->batch_size, + kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture)); + + assert(kgem->nbatch <= kgem->batch_size); + assert(kgem->nbatch <= kgem->surface); + assert(kgem->nreloc <= 
ARRAY_SIZE(kgem->reloc)); + assert(kgem->nexec < ARRAY_SIZE(kgem->exec)); + assert(kgem->nfence <= kgem->fence_max); + + kgem_finish_buffers(kgem); + +#if HAS_DEBUG_FULL && SHOW_BATCH + __kgem_batch_debug(kgem, batch_end); +#endif + + rq = kgem->next_request; + if (kgem->surface != kgem->batch_size) + size = compact_batch_surface(kgem); + else + size = kgem->nbatch * sizeof(kgem->batch[0]); + rq->bo = kgem_create_linear(kgem, size, CREATE_NO_THROTTLE); + if (rq->bo) { + uint32_t handle = rq->bo->handle; + int i; + + assert(!rq->bo->needs_flush); + + i = kgem->nexec++; + kgem->exec[i].handle = handle; + kgem->exec[i].relocation_count = kgem->nreloc; + kgem->exec[i].relocs_ptr = (uintptr_t)kgem->reloc; + kgem->exec[i].alignment = 0; + kgem->exec[i].offset = 0; + kgem->exec[i].flags = 0; + kgem->exec[i].rsvd1 = 0; + kgem->exec[i].rsvd2 = 0; + + rq->bo->exec = &kgem->exec[i]; + rq->bo->rq = rq; /* useful sanity check */ + list_add(&rq->bo->request, &rq->buffers); + rq->ring = kgem->ring == KGEM_BLT; + + kgem_fixup_self_relocs(kgem, rq->bo); + + if (kgem_batch_write(kgem, handle, size) == 0) { + struct drm_i915_gem_execbuffer2 execbuf; + int ret, retry = 3; + + VG_CLEAR(execbuf); + execbuf.buffers_ptr = (uintptr_t)kgem->exec; + execbuf.buffer_count = kgem->nexec; + execbuf.batch_start_offset = 0; + execbuf.batch_len = batch_end*sizeof(uint32_t); + execbuf.cliprects_ptr = 0; + execbuf.num_cliprects = 0; + execbuf.DR1 = 0; + execbuf.DR4 = 0; + execbuf.flags = kgem->ring | kgem->batch_flags; + execbuf.rsvd1 = 0; + execbuf.rsvd2 = 0; + + if (DBG_DUMP) { + int fd = open("/tmp/i915-batchbuffers.dump", + O_WRONLY | O_CREAT | O_APPEND, + 0666); + if (fd != -1) { + ret = write(fd, kgem->batch, batch_end*sizeof(uint32_t)); + fd = close(fd); + } + } + + ret = drmIoctl(kgem->fd, + DRM_IOCTL_I915_GEM_EXECBUFFER2, + &execbuf); + while (ret == -1 && errno == EBUSY && retry--) { + __kgem_throttle(kgem); + ret = drmIoctl(kgem->fd, + DRM_IOCTL_I915_GEM_EXECBUFFER2, + &execbuf); + } + if 
(ret == -1 && (errno == EIO || errno == EBUSY)) { + DBG(("%s: GPU hang detected\n", __FUNCTION__)); + kgem_throttle(kgem); + ret = 0; + } +#if !NDEBUG + if (ret < 0) { + ret = errno; + ErrorF("batch[%d/%d]: %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d: errno=%d\n", + kgem->mode, kgem->ring, batch_end, kgem->nbatch, kgem->surface, + kgem->nreloc, kgem->nexec, kgem->nfence, kgem->aperture, errno); + + for (i = 0; i < kgem->nexec; i++) { + struct kgem_bo *bo, *found = NULL; + + list_for_each_entry(bo, &kgem->next_request->buffers, request) { + if (bo->handle == kgem->exec[i].handle) { + found = bo; + break; + } + } + ErrorF("exec[%d] = handle:%d, presumed offset: %x, size: %d, tiling %d, fenced %d, snooped %d, deleted %d\n", + i, + kgem->exec[i].handle, + (int)kgem->exec[i].offset, + found ? kgem_bo_size(found) : -1, + found ? found->tiling : -1, + (int)(kgem->exec[i].flags & EXEC_OBJECT_NEEDS_FENCE), + found ? found->snoop : -1, + found ? found->purged : -1); + } + for (i = 0; i < kgem->nreloc; i++) { + ErrorF("reloc[%d] = pos:%d, target:%d, delta:%d, read:%x, write:%x, offset:%x\n", + i, + (int)kgem->reloc[i].offset, + kgem->reloc[i].target_handle, + kgem->reloc[i].delta, + kgem->reloc[i].read_domains, + kgem->reloc[i].write_domain, + (int)kgem->reloc[i].presumed_offset); + } + + i = open("/tmp/batchbuffer", O_WRONLY | O_CREAT | O_APPEND, 0666); + if (i != -1) { + i = write(i, kgem->batch, batch_end*sizeof(uint32_t)); + (void)i; + } + + FatalError("SNA: failed to submit batchbuffer, errno=%d\n", ret); + } +#endif + + if (DEBUG_FLUSH_SYNC) { + struct drm_i915_gem_set_domain set_domain; + + DBG(("%s: debug sync, starting\n", __FUNCTION__)); + + VG_CLEAR(set_domain); + set_domain.handle = handle; + set_domain.read_domains = I915_GEM_DOMAIN_GTT; + set_domain.write_domain = I915_GEM_DOMAIN_GTT; + + ret = drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain); + if (ret == -1) { + DBG(("%s: sync: GPU hang detected\n", __FUNCTION__)); + 
kgem_throttle(kgem); + } + + DBG(("%s: debug sync, completed\n", __FUNCTION__)); + } + } + + kgem_commit(kgem); + } + if (kgem->wedged) + kgem_cleanup(kgem); + + kgem_reset(kgem); + + assert(kgem->next_request != NULL); +} + +void kgem_throttle(struct kgem *kgem) +{ + kgem->need_throttle = 0; + if (kgem->wedged) + return; + + kgem->wedged = __kgem_throttle(kgem); + if (kgem->wedged) { + xf86DrvMsg(kgem_get_screen_index(kgem), X_ERROR, + "Detected a hung GPU, disabling acceleration.\n"); + xf86DrvMsg(kgem_get_screen_index(kgem), X_ERROR, + "When reporting this, please include i915_error_state from debugfs and the full dmesg.\n"); + } +} + +void kgem_purge_cache(struct kgem *kgem) +{ + struct kgem_bo *bo, *next; + int i; + + for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) { + list_for_each_entry_safe(bo, next, &kgem->inactive[i], list) { + if (!kgem_bo_is_retained(kgem, bo)) { + DBG(("%s: purging %d\n", + __FUNCTION__, bo->handle)); + kgem_bo_free(kgem, bo); + } + } + } + + kgem->need_purge = false; +} + +bool kgem_expire_cache(struct kgem *kgem) +{ + time_t now, expire; + struct kgem_bo *bo; + unsigned int size = 0, count = 0; + bool idle; + unsigned int i; + + time(&now); + + while (__kgem_freed_bo) { + bo = __kgem_freed_bo; + __kgem_freed_bo = *(struct kgem_bo **)bo; + free(bo); + } + + while (__kgem_freed_request) { + struct kgem_request *rq = __kgem_freed_request; + __kgem_freed_request = *(struct kgem_request **)rq; + free(rq); + } + + while (!list_is_empty(&kgem->large_inactive)) { + kgem_bo_free(kgem, + list_first_entry(&kgem->large_inactive, + struct kgem_bo, list)); + + } + + expire = 0; + list_for_each_entry(bo, &kgem->snoop, list) { + if (bo->delta) { + expire = now - MAX_INACTIVE_TIME/2; + break; + } + + bo->delta = now; + } + if (expire) { + while (!list_is_empty(&kgem->snoop)) { + bo = list_last_entry(&kgem->snoop, struct kgem_bo, list); + + if (bo->delta > expire) + break; + + kgem_bo_free(kgem, bo); + } + } +#ifdef DEBUG_MEMORY + { + long snoop_size 
= 0; + int snoop_count = 0; + list_for_each_entry(bo, &kgem->snoop, list) + snoop_count++, snoop_size += bytes(bo); + ErrorF("%s: still allocated %d bo, %ld bytes, in snoop cache\n", + __FUNCTION__, snoop_count, snoop_size); + } +#endif + + kgem_retire(kgem); + if (kgem->wedged) + kgem_cleanup(kgem); + + kgem->expire(kgem); + + if (kgem->need_purge) + kgem_purge_cache(kgem); + + expire = 0; + + idle = !kgem->need_retire; + for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) { + idle &= list_is_empty(&kgem->inactive[i]); + list_for_each_entry(bo, &kgem->inactive[i], list) { + if (bo->delta) { + expire = now - MAX_INACTIVE_TIME; + break; + } + + bo->delta = now; + } + } + if (idle) { + DBG(("%s: idle\n", __FUNCTION__)); + kgem->need_expire = false; + return false; + } + if (expire == 0) + return true; + + idle = !kgem->need_retire; + for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) { + struct list preserve; + + list_init(&preserve); + while (!list_is_empty(&kgem->inactive[i])) { + bo = list_last_entry(&kgem->inactive[i], + struct kgem_bo, list); + + if (bo->delta > expire) { + idle = false; + break; + } + + if (bo->map && bo->delta + MAP_PRESERVE_TIME > expire) { + idle = false; + list_move_tail(&bo->list, &preserve); + } else { + count++; + size += bytes(bo); + kgem_bo_free(kgem, bo); + DBG(("%s: expiring %d\n", + __FUNCTION__, bo->handle)); + } + } + if (!list_is_empty(&preserve)) { + preserve.prev->next = kgem->inactive[i].next; + kgem->inactive[i].next->prev = preserve.prev; + kgem->inactive[i].next = preserve.next; + preserve.next->prev = &kgem->inactive[i]; + } + } + +#ifdef DEBUG_MEMORY + { + long inactive_size = 0; + int inactive_count = 0; + for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) + list_for_each_entry(bo, &kgem->inactive[i], list) + inactive_count++, inactive_size += bytes(bo); + ErrorF("%s: still allocated %d bo, %ld bytes, in inactive cache\n", + __FUNCTION__, inactive_count, inactive_size); + } +#endif + + DBG(("%s: expired %d objects, %d bytes, 
idle? %d\n", + __FUNCTION__, count, size, idle)); + + kgem->need_expire = !idle; + return !idle; + (void)count; + (void)size; +} + +void kgem_cleanup_cache(struct kgem *kgem) +{ + unsigned int i; + int n; + + /* sync to the most recent request */ + for (n = 0; n < ARRAY_SIZE(kgem->requests); n++) { + if (!list_is_empty(&kgem->requests[n])) { + struct kgem_request *rq; + struct drm_i915_gem_set_domain set_domain; + + rq = list_first_entry(&kgem->requests[n], + struct kgem_request, + list); + + DBG(("%s: sync on cleanup\n", __FUNCTION__)); + + VG_CLEAR(set_domain); + set_domain.handle = rq->bo->handle; + set_domain.read_domains = I915_GEM_DOMAIN_GTT; + set_domain.write_domain = I915_GEM_DOMAIN_GTT; + (void)drmIoctl(kgem->fd, + DRM_IOCTL_I915_GEM_SET_DOMAIN, + &set_domain); + } + } + + kgem_retire(kgem); + kgem_cleanup(kgem); + + for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) { + while (!list_is_empty(&kgem->inactive[i])) + kgem_bo_free(kgem, + list_last_entry(&kgem->inactive[i], + struct kgem_bo, list)); + } + + while (!list_is_empty(&kgem->snoop)) + kgem_bo_free(kgem, + list_last_entry(&kgem->snoop, + struct kgem_bo, list)); + + while (__kgem_freed_bo) { + struct kgem_bo *bo = __kgem_freed_bo; + __kgem_freed_bo = *(struct kgem_bo **)bo; + free(bo); + } + + kgem->need_purge = false; + kgem->need_expire = false; +} + +static struct kgem_bo * +search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags) +{ + struct kgem_bo *bo, *first = NULL; + bool use_active = (flags & CREATE_INACTIVE) == 0; + struct list *cache; + + DBG(("%s: num_pages=%d, flags=%x, use_active? 
%d\n", + __FUNCTION__, num_pages, flags, use_active)); + + if (num_pages >= MAX_CACHE_SIZE / PAGE_SIZE) + return NULL; + + if (!use_active && list_is_empty(inactive(kgem, num_pages))) { + DBG(("%s: inactive and cache bucket empty\n", + __FUNCTION__)); + + if (flags & CREATE_NO_RETIRE) { + DBG(("%s: can not retire\n", __FUNCTION__)); + return NULL; + } + + if (list_is_empty(active(kgem, num_pages, I915_TILING_NONE))) { + DBG(("%s: active cache bucket empty\n", __FUNCTION__)); + return NULL; + } + + if (!__kgem_throttle_retire(kgem, 0)) { + DBG(("%s: nothing retired\n", __FUNCTION__)); + return NULL; + } + + if (list_is_empty(inactive(kgem, num_pages))) { + DBG(("%s: active cache bucket still empty after retire\n", + __FUNCTION__)); + return NULL; + } + } + + if (!use_active && flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { + int for_cpu = !!(flags & CREATE_CPU_MAP); + DBG(("%s: searching for inactive %s map\n", + __FUNCTION__, for_cpu ? "cpu" : "gtt")); + cache = &kgem->vma[for_cpu].inactive[cache_bucket(num_pages)]; + list_for_each_entry(bo, cache, vma) { + assert(IS_CPU_MAP(bo->map) == for_cpu); + assert(bucket(bo) == cache_bucket(num_pages)); + assert(bo->proxy == NULL); + assert(bo->rq == NULL); + assert(bo->exec == NULL); + + if (num_pages > num_pages(bo)) { + DBG(("inactive too small: %d < %d\n", + num_pages(bo), num_pages)); + continue; + } + + if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) { + kgem_bo_free(kgem, bo); + break; + } + + if (I915_TILING_NONE != bo->tiling && + gem_set_tiling(kgem->fd, bo->handle, + I915_TILING_NONE, 0) != I915_TILING_NONE) + continue; + + kgem_bo_remove_from_inactive(kgem, bo); + + bo->tiling = I915_TILING_NONE; + bo->pitch = 0; + bo->delta = 0; + DBG((" %s: found handle=%d (num_pages=%d) in linear vma cache\n", + __FUNCTION__, bo->handle, num_pages(bo))); + assert(use_active || bo->domain != DOMAIN_GPU); + assert(!bo->needs_flush); + //assert(!kgem_busy(kgem, bo->handle)); + return bo; + } + + if (flags & CREATE_EXACT) + 
return NULL; + } + + cache = use_active ? active(kgem, num_pages, I915_TILING_NONE) : inactive(kgem, num_pages); + list_for_each_entry(bo, cache, list) { + assert(bo->refcnt == 0); + assert(bo->reusable); + assert(!!bo->rq == !!use_active); + assert(bo->proxy == NULL); + + if (num_pages > num_pages(bo)) + continue; + + if (use_active && + kgem->gen <= 40 && + bo->tiling != I915_TILING_NONE) + continue; + + if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) { + kgem_bo_free(kgem, bo); + break; + } + + if (I915_TILING_NONE != bo->tiling) { + if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) + continue; + + if (first) + continue; + + if (gem_set_tiling(kgem->fd, bo->handle, + I915_TILING_NONE, 0) != I915_TILING_NONE) + continue; + + bo->tiling = I915_TILING_NONE; + } + + if (bo->map) { + if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { + int for_cpu = !!(flags & CREATE_CPU_MAP); + if (IS_CPU_MAP(bo->map) != for_cpu) { + if (first != NULL) + break; + + first = bo; + continue; + } + } else { + if (first != NULL) + break; + + first = bo; + continue; + } + } else { + if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { + if (first != NULL) + break; + + first = bo; + continue; + } + } + + if (use_active) + kgem_bo_remove_from_active(kgem, bo); + else + kgem_bo_remove_from_inactive(kgem, bo); + + assert(bo->tiling == I915_TILING_NONE); + bo->pitch = 0; + bo->delta = 0; + DBG((" %s: found handle=%d (num_pages=%d) in linear %s cache\n", + __FUNCTION__, bo->handle, num_pages(bo), + use_active ? 
"active" : "inactive")); + assert(list_is_empty(&bo->list)); + assert(use_active || bo->domain != DOMAIN_GPU); + assert(!bo->needs_flush || use_active); + //assert(use_active || !kgem_busy(kgem, bo->handle)); + return bo; + } + + if (first) { + assert(first->tiling == I915_TILING_NONE); + + if (use_active) + kgem_bo_remove_from_active(kgem, first); + else + kgem_bo_remove_from_inactive(kgem, first); + + first->pitch = 0; + first->delta = 0; + DBG((" %s: found handle=%d (near-miss) (num_pages=%d) in linear %s cache\n", + __FUNCTION__, first->handle, num_pages(first), + use_active ? "active" : "inactive")); + assert(list_is_empty(&first->list)); + assert(use_active || first->domain != DOMAIN_GPU); + assert(!first->needs_flush || use_active); + //assert(use_active || !kgem_busy(kgem, first->handle)); + return first; + } + + return NULL; +} + +struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name) +{ + struct drm_gem_open open_arg; + struct kgem_bo *bo; + + DBG(("%s(name=%d)\n", __FUNCTION__, name)); + + VG_CLEAR(open_arg); + open_arg.name = name; + if (drmIoctl(kgem->fd, DRM_IOCTL_GEM_OPEN, &open_arg)) + return NULL; + + DBG(("%s: new handle=%d\n", __FUNCTION__, open_arg.handle)); + bo = __kgem_bo_alloc(open_arg.handle, open_arg.size / PAGE_SIZE); + if (bo == NULL) { + gem_close(kgem->fd, open_arg.handle); + return NULL; + } + + bo->reusable = false; + bo->flush = true; + + debug_alloc__bo(kgem, bo); + return bo; +} + +struct kgem_bo *kgem_create_for_prime(struct kgem *kgem, int name, uint32_t size) +{ +#ifdef DRM_IOCTL_PRIME_FD_TO_HANDLE + struct drm_prime_handle args; + struct drm_i915_gem_get_tiling tiling; + struct kgem_bo *bo; + + DBG(("%s(name=%d)\n", __FUNCTION__, name)); + + VG_CLEAR(args); + args.fd = name; + args.flags = 0; + if (drmIoctl(kgem->fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &args)) + return NULL; + + VG_CLEAR(tiling); + tiling.handle = args.handle; + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling)) { + gem_close(kgem->fd, 
args.handle); + return NULL; + } + + DBG(("%s: new handle=%d, tiling=%d\n", __FUNCTION__, + args.handle, tiling.tiling_mode)); + bo = __kgem_bo_alloc(args.handle, NUM_PAGES(size)); + if (bo == NULL) { + gem_close(kgem->fd, args.handle); + return NULL; + } + + bo->tiling = tiling.tiling_mode; + bo->reusable = false; + + debug_alloc__bo(kgem, bo); + return bo; +#else + return NULL; +#endif +} + +int kgem_bo_export_to_prime(struct kgem *kgem, struct kgem_bo *bo) +{ +#if defined(DRM_IOCTL_PRIME_HANDLE_TO_FD) && defined(O_CLOEXEC) + struct drm_prime_handle args; + + VG_CLEAR(args); + args.handle = bo->handle; + args.flags = O_CLOEXEC; + + if (drmIoctl(kgem->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args)) + return -1; + + bo->reusable = false; + return args.fd; +#else + return -1; +#endif +} + +struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size, unsigned flags) +{ + struct kgem_bo *bo; + uint32_t handle; + + DBG(("%s(%d)\n", __FUNCTION__, size)); + + if (flags & CREATE_GTT_MAP && kgem->has_llc) { + flags &= ~CREATE_GTT_MAP; + flags |= CREATE_CPU_MAP; + } + + size = (size + PAGE_SIZE - 1) / PAGE_SIZE; + bo = search_linear_cache(kgem, size, CREATE_INACTIVE | flags); + if (bo) { + bo->refcnt = 1; + return bo; + } + + handle = gem_create(kgem->fd, size); + if (handle == 0) + return NULL; + + DBG(("%s: new handle=%d, num_pages=%d\n", __FUNCTION__, handle, size)); + bo = __kgem_bo_alloc(handle, size); + if (bo == NULL) { + gem_close(kgem->fd, handle); + return NULL; + } + + debug_alloc__bo(kgem, bo); + return bo; +} + +int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int bpp) +{ + if (DBG_NO_TILING) + return tiling < 0 ? 
tiling : I915_TILING_NONE; + + if (kgem->gen < 40) { + if (tiling && width * bpp > 8192 * 8) { + DBG(("%s: pitch too large for tliing [%d]\n", + __FUNCTION__, width*bpp/8)); + tiling = I915_TILING_NONE; + goto done; + } + } else { + if (width*bpp > (MAXSHORT-512) * 8) { + DBG(("%s: large pitch [%d], forcing TILING_X\n", + __FUNCTION__, width*bpp/8)); + if (tiling > 0) + tiling = -tiling; + else if (tiling == 0) + tiling = -I915_TILING_X; + } else if (tiling && (width|height) > 8192) { + DBG(("%s: large tiled buffer [%dx%d], forcing TILING_X\n", + __FUNCTION__, width, height)); + tiling = -I915_TILING_X; + } + } + + if (tiling < 0) + return tiling; + + if (tiling && height == 1) { + DBG(("%s: disabling tiling [%d] for single row\n", + __FUNCTION__,height)); + tiling = I915_TILING_NONE; + goto done; + } + if (tiling == I915_TILING_Y && height <= 16) { + DBG(("%s: too short [%d] for TILING_Y\n", + __FUNCTION__,height)); + tiling = I915_TILING_X; + } + if (tiling && width * bpp > 8 * (4096 - 64)) { + DBG(("%s: TLB miss between lines %dx%d (pitch=%d), forcing tiling %d\n", + __FUNCTION__, + width, height, width*bpp/8, + tiling)); + return -tiling; + } + if (tiling == I915_TILING_X && height < 4) { + DBG(("%s: too short [%d] for TILING_X\n", + __FUNCTION__, height)); + tiling = I915_TILING_NONE; + goto done; + } + + if (tiling == I915_TILING_X && width * bpp <= 8*512/2) { + DBG(("%s: too thin [width %d, %d bpp] for TILING_X\n", + __FUNCTION__, width, bpp)); + tiling = I915_TILING_NONE; + goto done; + } + if (tiling == I915_TILING_Y && width * bpp <= 8*128/2) { + DBG(("%s: too thin [%d] for TILING_Y\n", + __FUNCTION__, width)); + tiling = I915_TILING_NONE; + goto done; + } + + if (tiling && ALIGN(height, 2) * ALIGN(width*bpp, 8*64) <= 4096 * 8) { + DBG(("%s: too small [%d bytes] for TILING_%c\n", __FUNCTION__, + ALIGN(height, 2) * ALIGN(width*bpp, 8*64) / 8, + tiling == I915_TILING_X ? 
'X' : 'Y')); + tiling = I915_TILING_NONE; + goto done; + } + + if (tiling && width * bpp >= 8 * 4096 / 2) { + DBG(("%s: TLB near-miss between lines %dx%d (pitch=%d), forcing tiling %d\n", + __FUNCTION__, + width, height, width*bpp/8, + tiling)); + return -tiling; + } + +done: + DBG(("%s: %dx%d -> %d\n", __FUNCTION__, width, height, tiling)); + return tiling; +} + +static int bits_per_pixel(int depth) +{ + switch (depth) { + case 8: return 8; + case 15: + case 16: return 16; + case 24: + case 30: + case 32: return 32; + default: return 0; + } +} + +unsigned kgem_can_create_2d(struct kgem *kgem, + int width, int height, int depth) +{ + uint32_t pitch, size; + unsigned flags = 0; + int bpp; + + DBG(("%s: %dx%d @ %d\n", __FUNCTION__, width, height, depth)); + + bpp = bits_per_pixel(depth); + if (bpp == 0) { + DBG(("%s: unhandled depth %d\n", __FUNCTION__, depth)); + return 0; + } + + if (width > MAXSHORT || height > MAXSHORT) { + DBG(("%s: unhandled size %dx%d\n", + __FUNCTION__, width, height)); + return 0; + } + + size = kgem_surface_size(kgem, false, 0, + width, height, bpp, + I915_TILING_NONE, &pitch); + if (size > 0 && size <= kgem->max_cpu_size) + flags |= KGEM_CAN_CREATE_CPU | KGEM_CAN_CREATE_GPU; + if (size > 0 && size <= kgem->aperture_mappable/4) + flags |= KGEM_CAN_CREATE_GTT; + if (size > kgem->large_object_size) + flags |= KGEM_CAN_CREATE_LARGE; + if (size > kgem->max_object_size) { + DBG(("%s: too large (untiled) %d > %d\n", + __FUNCTION__, size, kgem->max_object_size)); + return 0; + } + + size = kgem_surface_size(kgem, false, 0, + width, height, bpp, + kgem_choose_tiling(kgem, I915_TILING_X, + width, height, bpp), + &pitch); + if (size > 0 && size <= kgem->max_gpu_size) + flags |= KGEM_CAN_CREATE_GPU; + if (size > 0 && size <= kgem->aperture_mappable/4) + flags |= KGEM_CAN_CREATE_GTT; + if (size > kgem->large_object_size) + flags |= KGEM_CAN_CREATE_LARGE; + if (size > kgem->max_object_size) { + DBG(("%s: too large (tiled) %d > %d\n", + __FUNCTION__, 
size, kgem->max_object_size)); + return 0; + } + + return flags; +} + +inline int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo) +{ + unsigned int size; + + assert(bo->tiling); + assert(kgem->gen < 40); + + if (kgem->gen < 30) + size = 512 * 1024; + else + size = 1024 * 1024; + while (size < bytes(bo)) + size *= 2; + + return size; +} + +struct kgem_bo *kgem_create_2d(struct kgem *kgem, + int width, + int height, + int bpp, + int tiling, + uint32_t flags) +{ + struct list *cache; + struct kgem_bo *bo; + uint32_t pitch, untiled_pitch, tiled_height, size; + uint32_t handle; + int i, bucket, retry; + + if (tiling < 0) + tiling = -tiling, flags |= CREATE_EXACT; + + DBG(("%s(%dx%d, bpp=%d, tiling=%d, exact=%d, inactive=%d, cpu-mapping=%d, gtt-mapping=%d, scanout?=%d, prime?=%d, temp?=%d)\n", __FUNCTION__, + width, height, bpp, tiling, + !!(flags & CREATE_EXACT), + !!(flags & CREATE_INACTIVE), + !!(flags & CREATE_CPU_MAP), + !!(flags & CREATE_GTT_MAP), + !!(flags & CREATE_SCANOUT), + !!(flags & CREATE_PRIME), + !!(flags & CREATE_TEMPORARY))); + + size = kgem_surface_size(kgem, kgem->has_relaxed_fencing, flags, + width, height, bpp, tiling, &pitch); + assert(size && size <= kgem->max_object_size); + size /= PAGE_SIZE; + bucket = cache_bucket(size); + + if (bucket >= NUM_CACHE_BUCKETS) { + DBG(("%s: large bo num pages=%d, bucket=%d\n", + __FUNCTION__, size, bucket)); + + if (flags & CREATE_INACTIVE) + goto large_inactive; + + tiled_height = kgem_aligned_height(kgem, height, tiling); + untiled_pitch = kgem_untiled_pitch(kgem, width, bpp, flags); + + list_for_each_entry(bo, &kgem->large, list) { + assert(!bo->purged); + assert(bo->refcnt == 0); + assert(bo->reusable); + + if (kgem->gen < 40) { + if (bo->pitch < pitch) { + DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n", + bo->tiling, tiling, + bo->pitch, pitch)); + continue; + } + + if (bo->pitch * tiled_height > bytes(bo)) + continue; + } else { + if (num_pages(bo) < size) + continue; 
+ + if (bo->pitch != pitch || bo->tiling != tiling) { + if (gem_set_tiling(kgem->fd, bo->handle, + tiling, pitch) != tiling) + continue; + + bo->pitch = pitch; + } + } + + kgem_bo_remove_from_active(kgem, bo); + + bo->unique_id = kgem_get_unique_id(kgem); + bo->delta = 0; + DBG((" 1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id)); + assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); + bo->refcnt = 1; + return bo; + } + +large_inactive: + list_for_each_entry(bo, &kgem->large_inactive, list) { + assert(bo->refcnt == 0); + assert(bo->reusable); + + if (size > num_pages(bo)) + continue; + + if (bo->tiling != tiling || + (tiling != I915_TILING_NONE && bo->pitch != pitch)) { + if (tiling != gem_set_tiling(kgem->fd, + bo->handle, + tiling, pitch)) + continue; + } + + if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) { + kgem_bo_free(kgem, bo); + break; + } + + list_del(&bo->list); + + bo->unique_id = kgem_get_unique_id(kgem); + bo->pitch = pitch; + bo->delta = 0; + DBG((" 1:from large inactive: pitch=%d, tiling=%d, handle=%d, id=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id)); + assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); + bo->refcnt = 1; + return bo; + } + + goto create; + } + + if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { + int for_cpu = !!(flags & CREATE_CPU_MAP); + if (kgem->has_llc && tiling == I915_TILING_NONE) + for_cpu = 1; + /* We presume that we will need to upload to this bo, + * and so would prefer to have an active VMA. 
+ */ + cache = &kgem->vma[for_cpu].inactive[bucket]; + do { + list_for_each_entry(bo, cache, vma) { + assert(bucket(bo) == bucket); + assert(bo->refcnt == 0); + assert(bo->map); + assert(IS_CPU_MAP(bo->map) == for_cpu); + assert(bo->rq == NULL); + assert(list_is_empty(&bo->request)); + + if (size > num_pages(bo)) { + DBG(("inactive too small: %d < %d\n", + num_pages(bo), size)); + continue; + } + + if (bo->tiling != tiling || + (tiling != I915_TILING_NONE && bo->pitch != pitch)) { + DBG(("inactive vma with wrong tiling: %d < %d\n", + bo->tiling, tiling)); + continue; + } + + if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) { + kgem_bo_free(kgem, bo); + break; + } + + bo->pitch = pitch; + bo->delta = 0; + bo->unique_id = kgem_get_unique_id(kgem); + + kgem_bo_remove_from_inactive(kgem, bo); + + DBG((" from inactive vma: pitch=%d, tiling=%d: handle=%d, id=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id)); + assert(bo->reusable); + assert(bo->domain != DOMAIN_GPU && !kgem_busy(kgem, bo->handle)); + assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); + bo->refcnt = 1; + return bo; + } + } while (!list_is_empty(cache) && + __kgem_throttle_retire(kgem, flags)); + } + + if (flags & CREATE_INACTIVE) + goto skip_active_search; + + /* Best active match */ + retry = NUM_CACHE_BUCKETS - bucket; + if (retry > 3 && (flags & CREATE_TEMPORARY) == 0) + retry = 3; +search_again: + assert(bucket < NUM_CACHE_BUCKETS); + cache = &kgem->active[bucket][tiling]; + if (tiling) { + tiled_height = kgem_aligned_height(kgem, height, tiling); + list_for_each_entry(bo, cache, list) { + assert(!bo->purged); + assert(bo->refcnt == 0); + assert(bucket(bo) == bucket); + assert(bo->reusable); + assert(bo->tiling == tiling); + + if (kgem->gen < 40) { + if (bo->pitch < pitch) { + DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n", + bo->tiling, tiling, + bo->pitch, pitch)); + continue; + } + + if (bo->pitch * tiled_height > 
bytes(bo)) + continue; + } else { + if (num_pages(bo) < size) + continue; + + if (bo->pitch != pitch) { + gem_set_tiling(kgem->fd, + bo->handle, + tiling, pitch); + + bo->pitch = pitch; + } + } + + kgem_bo_remove_from_active(kgem, bo); + + bo->unique_id = kgem_get_unique_id(kgem); + bo->delta = 0; + DBG((" 1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id)); + assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); + bo->refcnt = 1; + return bo; + } + } else { + list_for_each_entry(bo, cache, list) { + assert(bucket(bo) == bucket); + assert(!bo->purged); + assert(bo->refcnt == 0); + assert(bo->reusable); + assert(bo->tiling == tiling); + + if (num_pages(bo) < size) + continue; + + kgem_bo_remove_from_active(kgem, bo); + + bo->pitch = pitch; + bo->unique_id = kgem_get_unique_id(kgem); + bo->delta = 0; + DBG((" 1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id)); + assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); + bo->refcnt = 1; + return bo; + } + } + + if (--retry && flags & CREATE_EXACT) { + if (kgem->gen >= 40) { + for (i = I915_TILING_NONE; i <= I915_TILING_Y; i++) { + if (i == tiling) + continue; + + cache = &kgem->active[bucket][i]; + list_for_each_entry(bo, cache, list) { + assert(!bo->purged); + assert(bo->refcnt == 0); + assert(bo->reusable); + + if (num_pages(bo) < size) + continue; + + if (tiling != gem_set_tiling(kgem->fd, + bo->handle, + tiling, pitch)) + continue; + + kgem_bo_remove_from_active(kgem, bo); + + bo->unique_id = kgem_get_unique_id(kgem); + bo->pitch = pitch; + bo->tiling = tiling; + bo->delta = 0; + DBG((" 1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id)); + assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); + bo->refcnt = 1; + return bo; + } + } + } + + bucket++; + 
goto search_again; + } + + if ((flags & CREATE_EXACT) == 0) { /* allow an active near-miss? */ + untiled_pitch = kgem_untiled_pitch(kgem, width, bpp, flags); + i = tiling; + while (--i >= 0) { + tiled_height = kgem_surface_size(kgem, kgem->has_relaxed_fencing, flags, + width, height, bpp, tiling, &pitch); + cache = active(kgem, tiled_height / PAGE_SIZE, i); + tiled_height = kgem_aligned_height(kgem, height, i); + list_for_each_entry(bo, cache, list) { + assert(!bo->purged); + assert(bo->refcnt == 0); + assert(bo->reusable); + + if (bo->tiling) { + if (bo->pitch < pitch) { + DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n", + bo->tiling, tiling, + bo->pitch, pitch)); + continue; + } + } else + bo->pitch = untiled_pitch; + + if (bo->pitch * tiled_height > bytes(bo)) + continue; + + kgem_bo_remove_from_active(kgem, bo); + + bo->unique_id = kgem_get_unique_id(kgem); + bo->delta = 0; + DBG((" 1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id)); + assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); + bo->refcnt = 1; + return bo; + } + } + } + +skip_active_search: + bucket = cache_bucket(size); + retry = NUM_CACHE_BUCKETS - bucket; + if (retry > 3) + retry = 3; +search_inactive: + /* Now just look for a close match and prefer any currently active */ + assert(bucket < NUM_CACHE_BUCKETS); + cache = &kgem->inactive[bucket]; + list_for_each_entry(bo, cache, list) { + assert(bucket(bo) == bucket); + assert(bo->reusable); + + if (size > num_pages(bo)) { + DBG(("inactive too small: %d < %d\n", + num_pages(bo), size)); + continue; + } + + if (bo->tiling != tiling || + (tiling != I915_TILING_NONE && bo->pitch != pitch)) { + if (tiling != gem_set_tiling(kgem->fd, + bo->handle, + tiling, pitch)) + continue; + + if (bo->map) + kgem_bo_release_map(kgem, bo); + } + + if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) { + kgem_bo_free(kgem, bo); + break; + } + + 
kgem_bo_remove_from_inactive(kgem, bo); + + bo->pitch = pitch; + bo->tiling = tiling; + + bo->delta = 0; + bo->unique_id = kgem_get_unique_id(kgem); + assert(bo->pitch); + DBG((" from inactive: pitch=%d, tiling=%d: handle=%d, id=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id)); + assert(bo->refcnt == 0); + assert(bo->reusable); + assert((flags & CREATE_INACTIVE) == 0 || bo->domain != DOMAIN_GPU); + assert((flags & CREATE_INACTIVE) == 0 || !kgem_busy(kgem, bo->handle)); + assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo)); + bo->refcnt = 1; + return bo; + } + + if (flags & CREATE_INACTIVE && + !list_is_empty(&kgem->active[bucket][tiling]) && + __kgem_throttle_retire(kgem, flags)) { + flags &= ~CREATE_INACTIVE; + goto search_inactive; + } + + if (--retry) { + bucket++; + flags &= ~CREATE_INACTIVE; + goto search_inactive; + } + +create: + if (bucket >= NUM_CACHE_BUCKETS) + size = ALIGN(size, 1024); + handle = gem_create(kgem->fd, size); + if (handle == 0) + return NULL; + + bo = __kgem_bo_alloc(handle, size); + if (!bo) { + gem_close(kgem->fd, handle); + return NULL; + } + + bo->domain = DOMAIN_CPU; + bo->unique_id = kgem_get_unique_id(kgem); + bo->pitch = pitch; + if (tiling != I915_TILING_NONE) + bo->tiling = gem_set_tiling(kgem->fd, handle, tiling, pitch); + if (bucket >= NUM_CACHE_BUCKETS) { + DBG(("%s: marking large bo for automatic flushing\n", + __FUNCTION__)); + bo->flush = true; + } + + assert(bytes(bo) >= bo->pitch * kgem_aligned_height(kgem, height, bo->tiling)); + + debug_alloc__bo(kgem, bo); + + DBG((" new pitch=%d, tiling=%d, handle=%d, id=%d, num_pages=%d [%d], bucket=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id, + size, num_pages(bo), bucket(bo))); + return bo; +} + +struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem, + int width, + int height, + int bpp, + uint32_t flags) +{ + struct kgem_bo *bo; + int stride, size; + + if (DBG_NO_CPU) + return NULL; + + DBG(("%s(%dx%d, bpp=%d)\n", 
__FUNCTION__, width, height, bpp)); + + if (kgem->has_llc) { + bo = kgem_create_2d(kgem, width, height, bpp, + I915_TILING_NONE, flags); + if (bo == NULL) + return bo; + + assert(bo->tiling == I915_TILING_NONE); + + if (kgem_bo_map__cpu(kgem, bo) == NULL) { + kgem_bo_destroy(kgem, bo); + return NULL; + } + + return bo; + } + + assert(width > 0 && height > 0); + stride = ALIGN(width, 2) * bpp >> 3; + stride = ALIGN(stride, 4); + size = stride * ALIGN(height, 2); + assert(size >= PAGE_SIZE); + + DBG(("%s: %dx%d, %d bpp, stride=%d\n", + __FUNCTION__, width, height, bpp, stride)); + + bo = search_snoop_cache(kgem, NUM_PAGES(size), 0); + if (bo) { + assert(bo->tiling == I915_TILING_NONE); + assert(bo->snoop); + bo->refcnt = 1; + bo->pitch = stride; + bo->unique_id = kgem_get_unique_id(kgem); + return bo; + } + + if (kgem->has_cacheing) { + bo = kgem_create_linear(kgem, size, flags); + if (bo == NULL) + return NULL; + + assert(bo->tiling == I915_TILING_NONE); + + if (!gem_set_cacheing(kgem->fd, bo->handle, SNOOPED)) { + kgem_bo_destroy(kgem, bo); + return NULL; + } + bo->snoop = true; + + if (kgem_bo_map__cpu(kgem, bo) == NULL) { + kgem_bo_destroy(kgem, bo); + return NULL; + } + + bo->pitch = stride; + bo->unique_id = kgem_get_unique_id(kgem); + return bo; + } + + if (kgem->has_userptr) { + void *ptr; + + /* XXX */ + //if (posix_memalign(&ptr, 64, ALIGN(size, 64))) + if (posix_memalign(&ptr, PAGE_SIZE, ALIGN(size, PAGE_SIZE))) + return NULL; + + bo = kgem_create_map(kgem, ptr, size, false); + if (bo == NULL) { + free(ptr); + return NULL; + } + + bo->map = MAKE_USER_MAP(ptr); + bo->pitch = stride; + bo->unique_id = kgem_get_unique_id(kgem); + return bo; + } + + return NULL; +} + +void _kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) +{ + DBG(("%s: handle=%d, proxy? 
%d\n", + __FUNCTION__, bo->handle, bo->proxy != NULL)); + + if (bo->proxy) { + _list_del(&bo->vma); + _list_del(&bo->request); + if (bo->io && bo->exec == NULL) + _kgem_bo_delete_buffer(kgem, bo); + kgem_bo_unref(kgem, bo->proxy); + kgem_bo_binding_free(kgem, bo); + free(bo); + return; + } + + __kgem_bo_destroy(kgem, bo); +} + +bool __kgem_flush(struct kgem *kgem, struct kgem_bo *bo) +{ + /* The kernel will emit a flush *and* update its own flushing lists. */ + if (!bo->needs_flush) + return false; + + bo->needs_flush = kgem_busy(kgem, bo->handle); + DBG(("%s: handle=%d, busy?=%d\n", + __FUNCTION__, bo->handle, bo->needs_flush)); + return bo->needs_flush; +} + +bool kgem_check_bo(struct kgem *kgem, ...) +{ + va_list ap; + struct kgem_bo *bo; + int num_exec = 0; + int num_pages = 0; + + if (kgem_flush(kgem)) + return false; + + va_start(ap, kgem); + while ((bo = va_arg(ap, struct kgem_bo *))) { + if (bo->exec) + continue; + + while (bo->proxy) { + bo = bo->proxy; + if (bo->exec) + continue; + } + num_pages += num_pages(bo); + num_exec++; + } + va_end(ap); + + DBG(("%s: num_pages=+%d, num_exec=+%d\n", + __FUNCTION__, num_pages, num_exec)); + + if (!num_pages) + return true; + + if (kgem->aperture > kgem->aperture_low && kgem_is_idle(kgem)) { + DBG(("%s: current aperture usage (%d) is greater than low water mark (%d)\n", + __FUNCTION__, kgem->aperture, kgem->aperture_low)); + return false; + } + + if (num_pages + kgem->aperture > kgem->aperture_high) { + DBG(("%s: final aperture usage (%d) is greater than high water mark (%d)\n", + __FUNCTION__, num_pages + kgem->aperture, kgem->aperture_high)); + return false; + } + + if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem)) { + DBG(("%s: out of exec slots (%d + %d / %d)\n", __FUNCTION__, + kgem->nexec, num_exec, KGEM_EXEC_SIZE(kgem))); + return false; + } + + return true; +} + +bool kgem_check_bo_fenced(struct kgem *kgem, struct kgem_bo *bo) +{ + uint32_t size; + + if (kgem_flush(kgem)) + return false; + + while 
(bo->proxy) + bo = bo->proxy; + if (bo->exec) { + if (kgem->gen < 40 && + bo->tiling != I915_TILING_NONE && + (bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) { + if (kgem->nfence >= kgem->fence_max) + return false; + + size = kgem->aperture_fenced; + size += kgem_bo_fenced_size(kgem, bo); + if (4*size > 3*kgem->aperture_mappable) + return false; + } + + return true; + } + + if (kgem->nexec >= KGEM_EXEC_SIZE(kgem) - 1) + return false; + + if (kgem->aperture > kgem->aperture_low) + return false; + + if (kgem->aperture + num_pages(bo) > kgem->aperture_high) + return false; + + if (kgem->gen < 40 && bo->tiling != I915_TILING_NONE) { + if (kgem->nfence >= kgem->fence_max) + return false; + + if (2*kgem->aperture_fenced > kgem->aperture_mappable) + return false; + + size = kgem->aperture_fenced; + size += kgem_bo_fenced_size(kgem, bo); + if (4*size > 3*kgem->aperture_mappable) + return false; + } + + return true; +} + +bool kgem_check_many_bo_fenced(struct kgem *kgem, ...) +{ + va_list ap; + struct kgem_bo *bo; + int num_fence = 0; + int num_exec = 0; + int num_pages = 0; + int fenced_size = 0; + + if (kgem_flush(kgem)) + return false; + + va_start(ap, kgem); + while ((bo = va_arg(ap, struct kgem_bo *))) { + while (bo->proxy) + bo = bo->proxy; + if (bo->exec) { + if (kgem->gen >= 40 || bo->tiling == I915_TILING_NONE) + continue; + + if ((bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) { + fenced_size += kgem_bo_fenced_size(kgem, bo); + num_fence++; + } + + continue; + } + + num_pages += num_pages(bo); + num_exec++; + if (kgem->gen < 40 && bo->tiling) { + fenced_size += kgem_bo_fenced_size(kgem, bo); + num_fence++; + } + } + va_end(ap); + + if (num_fence) { + if (kgem->nfence + num_fence > kgem->fence_max) + return false; + + if (2*kgem->aperture_fenced > kgem->aperture_mappable) + return false; + + if (4*(fenced_size + kgem->aperture_fenced) > 3*kgem->aperture_mappable) + return false; + } + + if (num_pages) { + if (kgem->aperture > kgem->aperture_low) + return false; 
+ + if (num_pages + kgem->aperture > kgem->aperture_high) + return false; + + if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem)) + return false; + } + + return true; +} + +uint32_t kgem_add_reloc(struct kgem *kgem, + uint32_t pos, + struct kgem_bo *bo, + uint32_t read_write_domain, + uint32_t delta) +{ + int index; + + DBG(("%s: handle=%d, pos=%d, delta=%d, domains=%08x\n", + __FUNCTION__, bo ? bo->handle : 0, pos, delta, read_write_domain)); + + assert((read_write_domain & 0x7fff) == 0 || bo != NULL); + + index = kgem->nreloc++; + assert(index < ARRAY_SIZE(kgem->reloc)); + kgem->reloc[index].offset = pos * sizeof(kgem->batch[0]); + if (bo) { + assert(bo->refcnt); + assert(!bo->purged); + + while (bo->proxy) { + DBG(("%s: adding proxy [delta=%d] for handle=%d\n", + __FUNCTION__, bo->delta, bo->handle)); + delta += bo->delta; + assert(bo->handle == bo->proxy->handle); + /* need to release the cache upon batch submit */ + if (bo->exec == NULL) { + list_move_tail(&bo->request, + &kgem->next_request->buffers); + bo->rq = kgem->next_request; + bo->exec = &_kgem_dummy_exec; + } + + bo = bo->proxy; + assert(bo->refcnt); + assert(!bo->purged); + } + + if (bo->exec == NULL) + _kgem_add_bo(kgem, bo); + assert(bo->rq == kgem->next_request); + + if (kgem->gen < 40 && read_write_domain & KGEM_RELOC_FENCED) { + if (bo->tiling && + (bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) { + assert(kgem->nfence < kgem->fence_max); + kgem->aperture_fenced += + kgem_bo_fenced_size(kgem, bo); + kgem->nfence++; + } + bo->exec->flags |= EXEC_OBJECT_NEEDS_FENCE; + } + + kgem->reloc[index].delta = delta; + kgem->reloc[index].target_handle = bo->handle; + kgem->reloc[index].presumed_offset = bo->presumed_offset; + + if (read_write_domain & 0x7ff) { + assert(!bo->snoop || kgem->can_blt_cpu); + kgem_bo_mark_dirty(bo); + } + + delta += bo->presumed_offset; + } else { + kgem->reloc[index].delta = delta; + kgem->reloc[index].target_handle = 0; + kgem->reloc[index].presumed_offset = 0; + } + 
kgem->reloc[index].read_domains = read_write_domain >> 16; + kgem->reloc[index].write_domain = read_write_domain & 0x7fff; + + return delta; +} + +static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket) +{ + int i, j; + + DBG(("%s: type=%d, count=%d (bucket: %d)\n", + __FUNCTION__, type, kgem->vma[type].count, bucket)); + if (kgem->vma[type].count <= 0) + return; + + if (kgem->need_purge) + kgem_purge_cache(kgem); + + /* vma are limited on a per-process basis to around 64k. + * This includes all malloc arenas as well as other file + * mappings. In order to be fair and not hog the cache, + * and more importantly not to exhaust that limit and to + * start failing mappings, we keep our own number of open + * vma to within a conservative value. + */ + i = 0; + while (kgem->vma[type].count > 0) { + struct kgem_bo *bo = NULL; + + for (j = 0; + bo == NULL && j < ARRAY_SIZE(kgem->vma[type].inactive); + j++) { + struct list *head = &kgem->vma[type].inactive[i++%ARRAY_SIZE(kgem->vma[type].inactive)]; + if (!list_is_empty(head)) + bo = list_last_entry(head, struct kgem_bo, vma); + } + if (bo == NULL) + break; + + DBG(("%s: discarding inactive %s vma cache for %d\n", + __FUNCTION__, + IS_CPU_MAP(bo->map) ? 
"CPU" : "GTT", bo->handle)); + assert(IS_CPU_MAP(bo->map) == type); + assert(bo->map); + assert(bo->rq == NULL); + + VG(if (type) VALGRIND_MAKE_MEM_NOACCESS(MAP(bo->map), bytes(bo))); + munmap(MAP(bo->map), bytes(bo)); + bo->map = NULL; + list_del(&bo->vma); + kgem->vma[type].count--; + + if (!bo->purged && !kgem_bo_set_purgeable(kgem, bo)) { + DBG(("%s: freeing unpurgeable old mapping\n", + __FUNCTION__)); + kgem_bo_free(kgem, bo); + } + } +} + +void *kgem_bo_map__async(struct kgem *kgem, struct kgem_bo *bo) +{ + void *ptr; + + DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__, + bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain)); + + assert(!bo->purged); + assert(bo->proxy == NULL); + assert(list_is_empty(&bo->list)); + + if (bo->tiling == I915_TILING_NONE && !bo->scanout && kgem->has_llc) { + DBG(("%s: converting request for GTT map into CPU map\n", + __FUNCTION__)); + return kgem_bo_map__cpu(kgem, bo); + } + + if (IS_CPU_MAP(bo->map)) + kgem_bo_release_map(kgem, bo); + + ptr = bo->map; + if (ptr == NULL) { + assert(kgem_bo_size(bo) <= kgem->aperture_mappable / 2); + + kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); + + ptr = __kgem_bo_map__gtt(kgem, bo); + if (ptr == NULL) + return NULL; + + /* Cache this mapping to avoid the overhead of an + * excruciatingly slow GTT pagefault. This is more an + * issue with compositing managers which need to frequently + * flush CPU damage to their GPU bo. 
+ */ + bo->map = ptr; + DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle)); + } + + return ptr; +} + +void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo) +{ + void *ptr; + + DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__, + bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain)); + + assert(!bo->purged); + assert(bo->proxy == NULL); + assert(list_is_empty(&bo->list)); + assert(bo->exec == NULL); + + if (bo->tiling == I915_TILING_NONE && !bo->scanout && + (kgem->has_llc || bo->domain == DOMAIN_CPU)) { + DBG(("%s: converting request for GTT map into CPU map\n", + __FUNCTION__)); + ptr = kgem_bo_map__cpu(kgem, bo); + kgem_bo_sync__cpu(kgem, bo); + return ptr; + } + + if (IS_CPU_MAP(bo->map)) + kgem_bo_release_map(kgem, bo); + + ptr = bo->map; + if (ptr == NULL) { + assert(kgem_bo_size(bo) <= kgem->aperture_mappable / 2); + assert(kgem->gen != 21 || bo->tiling != I915_TILING_Y); + + kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); + + ptr = __kgem_bo_map__gtt(kgem, bo); + if (ptr == NULL) + return NULL; + + /* Cache this mapping to avoid the overhead of an + * excruciatingly slow GTT pagefault. This is more an + * issue with compositing managers which need to frequently + * flush CPU damage to their GPU bo. + */ + bo->map = ptr; + DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle)); + } + + if (bo->domain != DOMAIN_GTT) { + struct drm_i915_gem_set_domain set_domain; + + DBG(("%s: sync: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__, + bo->needs_flush, bo->domain, kgem_busy(kgem, bo->handle))); + + /* XXX use PROT_READ to avoid the write flush? 
*/ + + VG_CLEAR(set_domain); + set_domain.handle = bo->handle; + set_domain.read_domains = I915_GEM_DOMAIN_GTT; + set_domain.write_domain = I915_GEM_DOMAIN_GTT; + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) { + kgem_bo_retire(kgem, bo); + bo->domain = DOMAIN_GTT; + } + } + + return ptr; +} + +void *kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo) +{ + void *ptr; + + DBG(("%s: handle=%d, offset=%d, tiling=%d, map=%p, domain=%d\n", __FUNCTION__, + bo->handle, bo->presumed_offset, bo->tiling, bo->map, bo->domain)); + + assert(!bo->purged); + assert(bo->exec == NULL); + assert(list_is_empty(&bo->list)); + + if (IS_CPU_MAP(bo->map)) + kgem_bo_release_map(kgem, bo); + + ptr = bo->map; + if (ptr == NULL) { + assert(bytes(bo) <= kgem->aperture_mappable / 4); + + kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); + + ptr = __kgem_bo_map__gtt(kgem, bo); + if (ptr == NULL) + return NULL; + + /* Cache this mapping to avoid the overhead of an + * excruciatingly slow GTT pagefault. This is more an + * issue with compositing managers which need to frequently + * flush CPU damage to their GPU bo. + */ + bo->map = ptr; + DBG(("%s: caching GTT vma for %d\n", __FUNCTION__, bo->handle)); + } + + return ptr; +} + +void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo) +{ + if (bo->map) + return MAP(bo->map); + + kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); + return bo->map = __kgem_bo_map__gtt(kgem, bo); +} + +void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo) +{ + struct drm_i915_gem_mmap mmap_arg; + + DBG(("%s(handle=%d, size=%d, mapped? 
%d)\n", + __FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map))); + assert(!bo->purged); + assert(list_is_empty(&bo->list)); + assert(!bo->scanout); + assert(bo->proxy == NULL); + + if (IS_CPU_MAP(bo->map)) + return MAP(bo->map); + + if (bo->map) + kgem_bo_release_map(kgem, bo); + + kgem_trim_vma_cache(kgem, MAP_CPU, bucket(bo)); + +retry: + VG_CLEAR(mmap_arg); + mmap_arg.handle = bo->handle; + mmap_arg.offset = 0; + mmap_arg.size = bytes(bo); + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) { + ErrorF("%s: failed to mmap %d, %d bytes, into CPU domain: %d\n", + __FUNCTION__, bo->handle, bytes(bo), errno); + if (__kgem_throttle_retire(kgem, 0)) + goto retry; + + return NULL; + } + + VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo))); + + DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle)); + bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr); + return (void *)(uintptr_t)mmap_arg.addr_ptr; +} + +void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo) +{ + struct drm_i915_gem_mmap mmap_arg; + + DBG(("%s(handle=%d, size=%d, mapped? 
%d)\n", + __FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map))); + assert(bo->refcnt); + assert(!bo->purged); + assert(list_is_empty(&bo->list)); + assert(bo->proxy == NULL); + + if (IS_CPU_MAP(bo->map)) + return MAP(bo->map); + +retry: + VG_CLEAR(mmap_arg); + mmap_arg.handle = bo->handle; + mmap_arg.offset = 0; + mmap_arg.size = bytes(bo); + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) { + ErrorF("%s: failed to mmap %d, %d bytes, into CPU domain: %d\n", + __FUNCTION__, bo->handle, bytes(bo), errno); + if (__kgem_throttle_retire(kgem, 0)) + goto retry; + + return NULL; + } + + VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo))); + if (bo->map && bo->domain == DOMAIN_CPU) { + DBG(("%s: discarding GTT vma for %d\n", __FUNCTION__, bo->handle)); + kgem_bo_release_map(kgem, bo); + } + if (bo->map == NULL) { + DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle)); + bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr); + } + return (void *)(uintptr_t)mmap_arg.addr_ptr; +} + +void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr) +{ + DBG(("%s(handle=%d, size=%d)\n", + __FUNCTION__, bo->handle, bytes(bo))); + assert(bo->refcnt); + + if (IS_CPU_MAP(bo->map)) { + assert(ptr == MAP(bo->map)); + return; + } + + munmap(ptr, bytes(bo)); +} + +uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo) +{ + struct drm_gem_flink flink; + + VG_CLEAR(flink); + flink.handle = bo->handle; + if (drmIoctl(kgem->fd, DRM_IOCTL_GEM_FLINK, &flink)) + return 0; + + DBG(("%s: flinked handle=%d to name=%d, marking non-reusable\n", + __FUNCTION__, flink.handle, flink.name)); + + /* Ordinarily giving the name aware makes the buffer non-reusable. + * However, we track the lifetime of all clients and their hold + * on the buffer, and *presuming* they do not pass it on to a third + * party, we track the lifetime accurately. 
+ */ + bo->reusable = false; + + /* The bo is outside of our control, so presume it is written to */ + bo->needs_flush = true; + if (bo->domain != DOMAIN_GPU) + bo->domain = DOMAIN_NONE; + + /* Henceforth, we need to broadcast all updates to clients and + * flush our rendering before doing so. + */ + bo->flush = true; + if (bo->exec) + kgem->flush = 1; + + return flink.name; +} + +struct kgem_bo *kgem_create_map(struct kgem *kgem, + void *ptr, uint32_t size, + bool read_only) +{ + struct kgem_bo *bo; + uint32_t handle; + + if (!kgem->has_userptr) + return NULL; + + handle = gem_userptr(kgem->fd, ptr, size, read_only); + if (handle == 0) + return NULL; + + bo = __kgem_bo_alloc(handle, NUM_PAGES(size)); + if (bo == NULL) { + gem_close(kgem->fd, handle); + return NULL; + } + + bo->snoop = !kgem->has_llc; + debug_alloc__bo(kgem, bo); + + DBG(("%s(ptr=%p, size=%d, pages=%d, read_only=%d) => handle=%d\n", + __FUNCTION__, ptr, size, NUM_PAGES(size), read_only, handle)); + return bo; +} + +void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo) +{ + assert(bo->proxy == NULL); + kgem_bo_submit(kgem, bo); + + if (bo->domain != DOMAIN_CPU) { + struct drm_i915_gem_set_domain set_domain; + + DBG(("%s: sync: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__, + bo->needs_flush, bo->domain, kgem_busy(kgem, bo->handle))); + + VG_CLEAR(set_domain); + set_domain.handle = bo->handle; + set_domain.read_domains = I915_GEM_DOMAIN_CPU; + set_domain.write_domain = I915_GEM_DOMAIN_CPU; + + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) { + kgem_bo_retire(kgem, bo); + bo->domain = DOMAIN_CPU; + } + } +} + +void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo) +{ + assert(bo->proxy == NULL); + kgem_bo_submit(kgem, bo); + + if (bo->domain != DOMAIN_GTT) { + struct drm_i915_gem_set_domain set_domain; + + DBG(("%s: sync: needs_flush? %d, domain? %d, busy? 
%d\n", __FUNCTION__, + bo->needs_flush, bo->domain, kgem_busy(kgem, bo->handle))); + + VG_CLEAR(set_domain); + set_domain.handle = bo->handle; + set_domain.read_domains = I915_GEM_DOMAIN_GTT; + set_domain.write_domain = I915_GEM_DOMAIN_GTT; + + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) { + kgem_bo_retire(kgem, bo); + bo->domain = DOMAIN_GTT; + } + } +} + +void kgem_clear_dirty(struct kgem *kgem) +{ + struct kgem_request *rq = kgem->next_request; + struct kgem_bo *bo; + + list_for_each_entry(bo, &rq->buffers, request) { + if (!bo->dirty) + break; + + bo->dirty = false; + } +} + +struct kgem_bo *kgem_create_proxy(struct kgem *kgem, + struct kgem_bo *target, + int offset, int length) +{ + struct kgem_bo *bo; + + DBG(("%s: target handle=%d [proxy? %d], offset=%d, length=%d, io=%d\n", + __FUNCTION__, target->handle, target->proxy ? target->proxy->delta : -1, + offset, length, target->io)); + + bo = __kgem_bo_alloc(target->handle, length); + if (bo == NULL) + return NULL; + + bo->unique_id = kgem_get_unique_id(kgem); + bo->reusable = false; + bo->size.bytes = length; + + bo->io = target->io && target->proxy == NULL; + bo->dirty = target->dirty; + bo->tiling = target->tiling; + bo->pitch = target->pitch; + + bo->proxy = kgem_bo_reference(target); + bo->delta = offset; + + if (target->exec) { + list_move_tail(&bo->request, &kgem->next_request->buffers); + bo->exec = &_kgem_dummy_exec; + } + bo->rq = target->rq; + + return bo; +} + +static struct kgem_buffer * +buffer_alloc(void) +{ + struct kgem_buffer *bo; + + bo = malloc(sizeof(*bo)); + if (bo == NULL) + return NULL; + + bo->mem = NULL; + bo->need_io = false; + bo->mmapped = true; + + return bo; +} + +static struct kgem_buffer * +buffer_alloc_with_data(int num_pages) +{ + struct kgem_buffer *bo; + + bo = malloc(sizeof(*bo) + 2*UPLOAD_ALIGNMENT + num_pages * PAGE_SIZE); + if (bo == NULL) + return NULL; + + bo->mem = (void *)ALIGN((uintptr_t)bo + sizeof(*bo), UPLOAD_ALIGNMENT); + bo->mmapped 
= false; + return bo; +} + +static inline bool +use_snoopable_buffer(struct kgem *kgem, uint32_t flags) +{ + if ((flags & KGEM_BUFFER_WRITE) == 0) + return kgem->gen >= 30; + + return true; +} + +static void +init_buffer_from_bo(struct kgem_buffer *bo, struct kgem_bo *old) +{ + DBG(("%s: reusing handle=%d for buffer\n", + __FUNCTION__, old->handle)); + + assert(old->proxy == NULL); + + memcpy(&bo->base, old, sizeof(*old)); + if (old->rq) + list_replace(&old->request, &bo->base.request); + else + list_init(&bo->base.request); + list_replace(&old->vma, &bo->base.vma); + list_init(&bo->base.list); + free(old); + + assert(bo->base.tiling == I915_TILING_NONE); + + bo->base.refcnt = 1; +} + +static struct kgem_buffer * +search_snoopable_buffer(struct kgem *kgem, unsigned alloc) +{ + struct kgem_buffer *bo; + struct kgem_bo *old; + + old = search_snoop_cache(kgem, alloc, 0); + if (old) { + if (!old->io) { + bo = buffer_alloc(); + if (bo == NULL) + return NULL; + + init_buffer_from_bo(bo, old); + } else { + bo = (struct kgem_buffer *)old; + bo->base.refcnt = 1; + } + + DBG(("%s: created CPU handle=%d for buffer, size %d\n", + __FUNCTION__, bo->base.handle, num_pages(&bo->base))); + + assert(bo->base.snoop); + assert(bo->base.tiling == I915_TILING_NONE); + assert(num_pages(&bo->base) >= alloc); + assert(bo->mmapped == true); + assert(bo->need_io == false); + + bo->mem = kgem_bo_map__cpu(kgem, &bo->base); + if (bo->mem == NULL) { + bo->base.refcnt = 0; + kgem_bo_free(kgem, &bo->base); + bo = NULL; + } + + return bo; + } + + return NULL; +} + +static struct kgem_buffer * +create_snoopable_buffer(struct kgem *kgem, unsigned alloc) +{ + struct kgem_buffer *bo; + uint32_t handle; + + assert(!kgem->has_llc); + + if (kgem->has_cacheing) { + struct kgem_bo *old; + + bo = buffer_alloc(); + if (bo == NULL) + return NULL; + + old = search_linear_cache(kgem, alloc, + CREATE_INACTIVE | CREATE_CPU_MAP | CREATE_EXACT); + if (old) { + init_buffer_from_bo(bo, old); + } else { + handle = 
gem_create(kgem->fd, alloc); + if (handle == 0) { + free(bo); + return NULL; + } + + debug_alloc(kgem, alloc); + __kgem_bo_init(&bo->base, handle, alloc); + DBG(("%s: created CPU handle=%d for buffer, size %d\n", + __FUNCTION__, bo->base.handle, alloc)); + } + + assert(bo->base.refcnt == 1); + assert(bo->mmapped == true); + assert(bo->need_io == false); + + if (!gem_set_cacheing(kgem->fd, bo->base.handle, SNOOPED)) + goto free_cacheing; + + bo->base.snoop = true; + + bo->mem = kgem_bo_map__cpu(kgem, &bo->base); + if (bo->mem == NULL) + goto free_cacheing; + + return bo; + +free_cacheing: + bo->base.refcnt = 0; /* for valgrind */ + kgem_bo_free(kgem, &bo->base); + } + + if (kgem->has_userptr) { + bo = buffer_alloc(); + if (bo == NULL) + return NULL; + + //if (posix_memalign(&ptr, 64, ALIGN(size, 64))) + if (posix_memalign(&bo->mem, PAGE_SIZE, alloc *PAGE_SIZE)) { + free(bo); + return NULL; + } + + handle = gem_userptr(kgem->fd, bo->mem, alloc * PAGE_SIZE, false); + if (handle == 0) { + free(bo->mem); + free(bo); + return NULL; + } + + debug_alloc(kgem, alloc); + __kgem_bo_init(&bo->base, handle, alloc); + DBG(("%s: created snoop handle=%d for buffer\n", + __FUNCTION__, bo->base.handle)); + + assert(bo->mmapped == true); + assert(bo->need_io == false); + + bo->base.refcnt = 1; + bo->base.snoop = true; + bo->base.map = MAKE_USER_MAP(bo->mem); + + return bo; + } + + return NULL; +} + +struct kgem_bo *kgem_create_buffer(struct kgem *kgem, + uint32_t size, uint32_t flags, + void **ret) +{ + struct kgem_buffer *bo; + unsigned offset, alloc; + struct kgem_bo *old; + + DBG(("%s: size=%d, flags=%x [write?=%d, inplace?=%d, last?=%d]\n", + __FUNCTION__, size, flags, + !!(flags & KGEM_BUFFER_WRITE), + !!(flags & KGEM_BUFFER_INPLACE), + !!(flags & KGEM_BUFFER_LAST))); + assert(size); + /* we should never be asked to create anything TOO large */ + assert(size <= kgem->max_object_size); + + if (kgem->has_llc) + flags &= ~KGEM_BUFFER_INPLACE; + +#if !DBG_NO_UPLOAD_CACHE + 
list_for_each_entry(bo, &kgem->batch_buffers, base.list) { + assert(bo->base.io); + assert(bo->base.refcnt >= 1); + + /* We can reuse any write buffer which we can fit */ + if (flags == KGEM_BUFFER_LAST && + bo->write == KGEM_BUFFER_WRITE && + bo->base.refcnt == 1 && !bo->mmapped && + size <= bytes(&bo->base)) { + DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n", + __FUNCTION__, size, bo->used, bytes(&bo->base))); + gem_write(kgem->fd, bo->base.handle, + 0, bo->used, bo->mem); + kgem_buffer_release(kgem, bo); + bo->need_io = 0; + bo->write = 0; + offset = 0; + bo->used = size; + goto done; + } + + if (flags & KGEM_BUFFER_WRITE) { + if ((bo->write & KGEM_BUFFER_WRITE) == 0 || + (((bo->write & ~flags) & KGEM_BUFFER_INPLACE) && + !bo->base.snoop)) { + DBG(("%s: skip write %x buffer, need %x\n", + __FUNCTION__, bo->write, flags)); + continue; + } + assert(bo->mmapped || bo->need_io); + } else { + if (bo->write & KGEM_BUFFER_WRITE) { + DBG(("%s: skip write %x buffer, need %x\n", + __FUNCTION__, bo->write, flags)); + continue; + } + } + + if (bo->used + size <= bytes(&bo->base)) { + DBG(("%s: reusing buffer? used=%d + size=%d, total=%d\n", + __FUNCTION__, bo->used, size, bytes(&bo->base))); + offset = bo->used; + bo->used += size; + goto done; + } + } + + if (flags & KGEM_BUFFER_WRITE) { + list_for_each_entry(bo, &kgem->active_buffers, base.list) { + assert(bo->base.io); + assert(bo->base.refcnt >= 1); + assert(bo->mmapped); + assert(!bo->base.snoop); + assert(!IS_CPU_MAP(bo->base.map) || kgem->has_llc); + + if ((bo->write & ~flags) & KGEM_BUFFER_INPLACE) { + DBG(("%s: skip write %x buffer, need %x\n", + __FUNCTION__, bo->write, flags)); + continue; + } + + if (bo->used + size <= bytes(&bo->base)) { + DBG(("%s: reusing buffer? 
used=%d + size=%d, total=%d\n", + __FUNCTION__, bo->used, size, bytes(&bo->base))); + offset = bo->used; + bo->used += size; + list_move(&bo->base.list, &kgem->batch_buffers); + goto done; + } + } + } +#endif + +#if !DBG_NO_MAP_UPLOAD + /* Be a little more generous and hope to hold fewer mmappings */ + alloc = ALIGN(2*size, kgem->buffer_size); + if (alloc > MAX_CACHE_SIZE) + alloc = ALIGN(size, kgem->buffer_size); + if (alloc > MAX_CACHE_SIZE) + alloc = PAGE_ALIGN(size); + alloc /= PAGE_SIZE; + if (kgem->has_llc) { + bo = buffer_alloc(); + if (bo == NULL) + return NULL; + + old = NULL; + if ((flags & KGEM_BUFFER_WRITE) == 0) + old = search_linear_cache(kgem, alloc, CREATE_CPU_MAP); + if (old == NULL) + old = search_linear_cache(kgem, alloc, CREATE_INACTIVE | CREATE_CPU_MAP); + if (old == NULL) + old = search_linear_cache(kgem, NUM_PAGES(size), CREATE_INACTIVE | CREATE_CPU_MAP); + if (old) { + DBG(("%s: found LLC handle=%d for buffer\n", + __FUNCTION__, old->handle)); + + init_buffer_from_bo(bo, old); + } else { + uint32_t handle = gem_create(kgem->fd, alloc); + if (handle == 0) { + free(bo); + return NULL; + } + __kgem_bo_init(&bo->base, handle, alloc); + DBG(("%s: created LLC handle=%d for buffer\n", + __FUNCTION__, bo->base.handle)); + + debug_alloc(kgem, alloc); + } + + assert(bo->mmapped); + assert(!bo->need_io); + + bo->mem = kgem_bo_map__cpu(kgem, &bo->base); + if (bo->mem) { + if (flags & KGEM_BUFFER_WRITE) + kgem_bo_sync__cpu(kgem, &bo->base); + + alloc = num_pages(&bo->base); + goto init; + } else { + bo->base.refcnt = 0; /* for valgrind */ + kgem_bo_free(kgem, &bo->base); + } + } + + if (PAGE_SIZE * alloc > kgem->aperture_mappable / 4) + flags &= ~KGEM_BUFFER_INPLACE; + + if ((flags & KGEM_BUFFER_WRITE_INPLACE) == KGEM_BUFFER_WRITE_INPLACE) { + /* The issue with using a GTT upload buffer is that we may + * cause eviction-stalls in order to free up some GTT space. + * An is-mappable? 
ioctl could help us detect when we are + * about to block, or some per-page magic in the kernel. + * + * XXX This is especially noticeable on memory constrained + * devices like gen2 or with relatively slow gpu like i3. + */ + DBG(("%s: searching for an inactive GTT map for upload\n", + __FUNCTION__)); + old = search_linear_cache(kgem, alloc, + CREATE_EXACT | CREATE_INACTIVE | CREATE_GTT_MAP); +#if HAVE_I915_GEM_BUFFER_INFO + if (old) { + struct drm_i915_gem_buffer_info info; + + /* An example of such a non-blocking ioctl might work */ + + VG_CLEAR(info); + info.handle = handle; + if (drmIoctl(kgem->fd, + DRM_IOCTL_I915_GEM_BUFFER_INFO, + &fino) == 0) { + old->presumed_offset = info.addr; + if ((info.flags & I915_GEM_MAPPABLE) == 0) { + kgem_bo_move_to_inactive(kgem, old); + old = NULL; + } + } + } +#endif + if (old == NULL) + old = search_linear_cache(kgem, NUM_PAGES(size), + CREATE_EXACT | CREATE_INACTIVE | CREATE_GTT_MAP); + if (old == NULL) { + old = search_linear_cache(kgem, alloc, CREATE_INACTIVE); + if (old && !kgem_bo_is_mappable(kgem, old)) { + _kgem_bo_destroy(kgem, old); + old = NULL; + } + } + if (old) { + DBG(("%s: reusing handle=%d for buffer\n", + __FUNCTION__, old->handle)); + assert(kgem_bo_is_mappable(kgem, old)); + assert(!old->snoop); + assert(old->rq == NULL); + + bo = buffer_alloc(); + if (bo == NULL) + return NULL; + + init_buffer_from_bo(bo, old); + assert(num_pages(&bo->base) >= NUM_PAGES(size)); + + assert(bo->mmapped); + assert(bo->base.refcnt == 1); + + bo->mem = kgem_bo_map(kgem, &bo->base); + if (bo->mem) { + alloc = num_pages(&bo->base); + if (IS_CPU_MAP(bo->base.map)) + flags &= ~KGEM_BUFFER_INPLACE; + goto init; + } else { + bo->base.refcnt = 0; + kgem_bo_free(kgem, &bo->base); + } + } + } +#else + flags &= ~KGEM_BUFFER_INPLACE; +#endif + /* Be more parsimonious with pwrite/pread/cacheable buffers */ + if ((flags & KGEM_BUFFER_INPLACE) == 0) + alloc = NUM_PAGES(size); + + if (use_snoopable_buffer(kgem, flags)) { + bo = 
search_snoopable_buffer(kgem, alloc); + if (bo) { + if (flags & KGEM_BUFFER_WRITE) + kgem_bo_sync__cpu(kgem, &bo->base); + flags &= ~KGEM_BUFFER_INPLACE; + alloc = num_pages(&bo->base); + goto init; + } + + if ((flags & KGEM_BUFFER_WRITE_INPLACE) != KGEM_BUFFER_WRITE_INPLACE) { + bo = create_snoopable_buffer(kgem, alloc); + if (bo) { + flags &= ~KGEM_BUFFER_INPLACE; + goto init; + } + } + } + + flags &= ~KGEM_BUFFER_INPLACE; + + old = NULL; + if ((flags & KGEM_BUFFER_WRITE) == 0) + old = search_linear_cache(kgem, alloc, 0); + if (old == NULL) + old = search_linear_cache(kgem, alloc, CREATE_INACTIVE); + if (old) { + DBG(("%s: reusing ordinary handle %d for io\n", + __FUNCTION__, old->handle)); + alloc = num_pages(old); + bo = buffer_alloc_with_data(alloc); + if (bo == NULL) + return NULL; + + init_buffer_from_bo(bo, old); + bo->need_io = flags & KGEM_BUFFER_WRITE; + } else { + unsigned hint; + + if (use_snoopable_buffer(kgem, flags)) { + bo = create_snoopable_buffer(kgem, alloc); + if (bo) + goto init; + } + + bo = buffer_alloc(); + if (bo == NULL) + return NULL; + + hint = CREATE_INACTIVE; + if (flags & KGEM_BUFFER_WRITE) + hint |= CREATE_CPU_MAP; + old = search_linear_cache(kgem, alloc, hint); + if (old) { + DBG(("%s: reusing handle=%d for buffer\n", + __FUNCTION__, old->handle)); + + alloc = num_pages(old); + init_buffer_from_bo(bo, old); + } else { + uint32_t handle = gem_create(kgem->fd, alloc); + if (handle == 0) { + free(bo); + return NULL; + } + + DBG(("%s: created handle=%d for buffer\n", + __FUNCTION__, bo->base.handle)); + + __kgem_bo_init(&bo->base, handle, alloc); + debug_alloc(kgem, alloc * PAGE_SIZE); + } + + assert(bo->mmapped); + assert(!bo->need_io); + assert(bo->base.refcnt == 1); + + if (flags & KGEM_BUFFER_WRITE) { + bo->mem = kgem_bo_map__cpu(kgem, &bo->base); + if (bo->mem != NULL) + kgem_bo_sync__cpu(kgem, &bo->base); + goto init; + } + + DBG(("%s: failing back to new pwrite buffer\n", __FUNCTION__)); + old = &bo->base; + bo = 
buffer_alloc_with_data(alloc); + if (bo == NULL) { + free(old); + return NULL; + } + + init_buffer_from_bo(bo, old); + + assert(bo->mem); + assert(!bo->mmapped); + assert(bo->base.refcnt == 1); + + bo->need_io = flags & KGEM_BUFFER_WRITE; + } +init: + bo->base.io = true; + assert(bo->base.refcnt == 1); + assert(num_pages(&bo->base) == alloc); + assert(!bo->need_io || !bo->base.needs_flush); + assert(!bo->need_io || bo->base.domain != DOMAIN_GPU); + assert(bo->mem); + assert(!bo->mmapped || bo->base.map != NULL); + + bo->used = size; + bo->write = flags & KGEM_BUFFER_WRITE_INPLACE; + offset = 0; + + assert(list_is_empty(&bo->base.list)); + list_add(&bo->base.list, &kgem->batch_buffers); + + DBG(("%s(pages=%d) new handle=%d, used=%d, write=%d\n", + __FUNCTION__, alloc, bo->base.handle, bo->used, bo->write)); + +done: + bo->used = ALIGN(bo->used, UPLOAD_ALIGNMENT); + assert(bo->mem); + *ret = (char *)bo->mem + offset; + return kgem_create_proxy(kgem, &bo->base, offset, size); +} + +bool kgem_buffer_is_inplace(struct kgem_bo *_bo) +{ + struct kgem_buffer *bo = (struct kgem_buffer *)_bo->proxy; + return bo->write & KGEM_BUFFER_WRITE_INPLACE; +} + +struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem, + int width, int height, int bpp, + uint32_t flags, + void **ret) +{ + struct kgem_bo *bo; + int stride; + + assert(width > 0 && height > 0); + assert(ret != NULL); + stride = ALIGN(width, 2) * bpp >> 3; + stride = ALIGN(stride, 4); + + DBG(("%s: %dx%d, %d bpp, stride=%d\n", + __FUNCTION__, width, height, bpp, stride)); + + bo = kgem_create_buffer(kgem, stride * ALIGN(height, 2), flags, ret); + if (bo == NULL) { + DBG(("%s: allocation failure for upload buffer\n", + __FUNCTION__)); + return NULL; + } + assert(*ret != NULL); + + if (height & 1) { + struct kgem_buffer *io = (struct kgem_buffer *)bo->proxy; + int min; + + assert(io->used); + + /* Having padded this surface to ensure that accesses to + * the last pair of rows is valid, remove the padding so + * that it can 
be allocated to other pixmaps. + */ + min = bo->delta + height * stride; + min = ALIGN(min, UPLOAD_ALIGNMENT); + if (io->used != min) { + DBG(("%s: trimming buffer from %d to %d\n", + __FUNCTION__, io->used, min)); + io->used = min; + } + bo->size.bytes -= stride; + } + + bo->pitch = stride; + bo->unique_id = kgem_get_unique_id(kgem); + return bo; +} + +struct kgem_bo *kgem_upload_source_image(struct kgem *kgem, + const void *data, + const BoxRec *box, + int stride, int bpp) +{ + int width = box->x2 - box->x1; + int height = box->y2 - box->y1; + struct kgem_bo *bo; + void *dst; + + DBG(("%s : (%d, %d), (%d, %d), stride=%d, bpp=%d\n", + __FUNCTION__, box->x1, box->y1, box->x2, box->y2, stride, bpp)); + + assert(data); + assert(width > 0); + assert(height > 0); + assert(stride); + assert(bpp); + + bo = kgem_create_buffer_2d(kgem, + width, height, bpp, + KGEM_BUFFER_WRITE_INPLACE, &dst); + if (bo) + memcpy_blt(data, dst, bpp, + stride, bo->pitch, + box->x1, box->y1, + 0, 0, + width, height); + + return bo; +} + +void kgem_proxy_bo_attach(struct kgem_bo *bo, + struct kgem_bo **ptr) +{ + DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); + assert(bo->map == NULL); + assert(bo->proxy); + list_add(&bo->vma, &bo->proxy->vma); + bo->map = ptr; + *ptr = kgem_bo_reference(bo); +} + +void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo) +{ + struct kgem_buffer *bo; + uint32_t offset = _bo->delta, length = _bo->size.bytes; + + /* We expect the caller to have already submitted the batch */ + assert(_bo->io); + assert(_bo->exec == NULL); + assert(_bo->rq == NULL); + assert(_bo->proxy); + + _bo = _bo->proxy; + assert(_bo->proxy == NULL); + assert(_bo->exec == NULL); + + bo = (struct kgem_buffer *)_bo; + + DBG(("%s(offset=%d, length=%d, snooped=%d)\n", __FUNCTION__, + offset, length, bo->base.snoop)); + + if (bo->mmapped) { + struct drm_i915_gem_set_domain set_domain; + + DBG(("%s: sync: needs_flush? %d, domain? %d, busy? 
%d\n", + __FUNCTION__, + bo->base.needs_flush, + bo->base.domain, + kgem_busy(kgem, bo->base.handle))); + + assert(!IS_CPU_MAP(bo->base.map) || bo->base.snoop || kgem->has_llc); + + VG_CLEAR(set_domain); + set_domain.handle = bo->base.handle; + set_domain.write_domain = 0; + set_domain.read_domains = + IS_CPU_MAP(bo->base.map) ? I915_GEM_DOMAIN_CPU : I915_GEM_DOMAIN_GTT; + + if (drmIoctl(kgem->fd, + DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain)) + return; + } else { + if (gem_read(kgem->fd, + bo->base.handle, (char *)bo->mem+offset, + offset, length)) + return; + } + kgem_bo_retire(kgem, &bo->base); +} + +uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format) +{ + struct kgem_bo_binding *b; + + for (b = &bo->binding; b && b->offset; b = b->next) + if (format == b->format) + return b->offset; + + return 0; +} + +void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset) +{ + struct kgem_bo_binding *b; + + for (b = &bo->binding; b; b = b->next) { + if (b->offset) + continue; + + b->offset = offset; + b->format = format; + + if (b->next) + b->next->offset = 0; + + return; + } + + b = malloc(sizeof(*b)); + if (b) { + b->next = bo->binding.next; + b->format = format; + b->offset = offset; + bo->binding.next = b; + } +} + +int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo) +{ + struct drm_i915_gem_get_tiling tiling; + + VG_CLEAR(tiling); + tiling.handle = bo->handle; + if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling)) + return 0; + + assert(bo->tiling == tiling.tiling_mode); + return tiling.swizzle_mode; +} + +struct kgem_bo * +kgem_replace_bo(struct kgem *kgem, + struct kgem_bo *src, + uint32_t width, + uint32_t height, + uint32_t pitch, + uint32_t bpp) +{ + struct kgem_bo *dst; + uint32_t br00, br13; + uint32_t handle; + uint32_t size; + uint32_t *b; + + DBG(("%s: replacing bo handle=%d, size=%dx%d pitch=%d, with pitch=%d\n", + __FUNCTION__, src->handle, width, height, src->pitch, pitch)); + + /* We only expect 
to be called to fixup small buffers, hence why + * we only attempt to allocate a linear bo. + */ + assert(src->tiling == I915_TILING_NONE); + + size = height * pitch; + size = PAGE_ALIGN(size) / PAGE_SIZE; + + dst = search_linear_cache(kgem, size, 0); + if (dst == NULL) + dst = search_linear_cache(kgem, size, CREATE_INACTIVE); + if (dst == NULL) { + handle = gem_create(kgem->fd, size); + if (handle == 0) + return NULL; + + dst = __kgem_bo_alloc(handle, size); + if (dst== NULL) { + gem_close(kgem->fd, handle); + return NULL; + } + + debug_alloc__bo(kgem, dst); + } + dst->pitch = pitch; + dst->unique_id = kgem_get_unique_id(kgem); + dst->refcnt = 1; + + kgem_set_mode(kgem, KGEM_BLT); + if (!kgem_check_batch(kgem, 8) || + !kgem_check_reloc(kgem, 2) || + !kgem_check_many_bo_fenced(kgem, src, dst, NULL)) { + _kgem_submit(kgem); + _kgem_set_mode(kgem, KGEM_BLT); + } + + br00 = XY_SRC_COPY_BLT_CMD; + br13 = pitch; + pitch = src->pitch; + if (kgem->gen >= 40 && src->tiling) { + br00 |= BLT_SRC_TILED; + pitch >>= 2; + } + + br13 |= 0xcc << 16; + switch (bpp) { + default: + case 32: br00 |= BLT_WRITE_ALPHA | BLT_WRITE_RGB; + br13 |= 1 << 25; /* RGB8888 */ + case 16: br13 |= 1 << 24; /* RGB565 */ + case 8: break; + } + + b = kgem->batch + kgem->nbatch; + b[0] = br00; + b[1] = br13; + b[2] = 0; + b[3] = height << 16 | width; + b[4] = kgem_add_reloc(kgem, kgem->nbatch + 4, dst, + I915_GEM_DOMAIN_RENDER << 16 | + I915_GEM_DOMAIN_RENDER | + KGEM_RELOC_FENCED, + 0); + b[5] = 0; + b[6] = pitch; + b[7] = kgem_add_reloc(kgem, kgem->nbatch + 7, src, + I915_GEM_DOMAIN_RENDER << 16 | + KGEM_RELOC_FENCED, + 0); + kgem->nbatch += 8; + + return dst; +} diff --git a/cogl/driver/drm/kgem.h b/cogl/driver/drm/kgem.h new file mode 100644 index 00000000..fcc7f1c0 --- /dev/null +++ b/cogl/driver/drm/kgem.h @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated 
documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Chris Wilson + * + */ + +#ifndef KGEM_H +#define KGEM_H + +#include +#include +#include + +#include + +#include "compiler.h" +#include "intel_list.h" + +#include "cogl-private.h" + +#if HAS_DEBUG_FULL +#define DBG(x) ErrorF x +#else +#define DBG(x) +#endif + +struct kgem_bo { + struct kgem_bo *proxy; + + struct list list; + struct list request; + struct list vma; + + void *map; +#define IS_CPU_MAP(ptr) ((uintptr_t)(ptr) & 1) +#define IS_GTT_MAP(ptr) (ptr && ((uintptr_t)(ptr) & 1) == 0) + struct kgem_request *rq; + struct drm_i915_gem_exec_object2 *exec; + + struct kgem_bo_binding { + struct kgem_bo_binding *next; + uint32_t format; + uint16_t offset; + } binding; + + uint32_t unique_id; + uint32_t refcnt; + uint32_t handle; + uint32_t presumed_offset; + uint32_t delta; + union { + struct { + uint32_t count:27; +#define PAGE_SIZE 4096 + uint32_t bucket:5; +#define NUM_CACHE_BUCKETS 16 +#define MAX_CACHE_SIZE (1 << (NUM_CACHE_BUCKETS+12)) + } pages; + uint32_t bytes; + } size; + uint32_t pitch : 18; /* max 128k */ + uint32_t tiling : 2; + uint32_t reusable : 1; + uint32_t dirty : 1; + uint32_t domain : 2; + uint32_t needs_flush : 1; + uint32_t snoop : 1; + uint32_t io : 1; + uint32_t flush : 1; + uint32_t scanout : 1; + uint32_t purged : 1; +}; +#define DOMAIN_NONE 0 +#define DOMAIN_CPU 1 +#define DOMAIN_GTT 2 +#define DOMAIN_GPU 3 + +struct kgem_request { + struct list list; + struct kgem_bo *bo; + struct list buffers; + int ring; +}; + +enum { + MAP_GTT = 0, + MAP_CPU, + NUM_MAP_TYPES, +}; + +struct kgem { + int fd; + int wedged; + unsigned gen; + + uint32_t unique_id; + + enum kgem_mode { + /* order matches I915_EXEC_RING ordering */ + KGEM_NONE = 0, + KGEM_RENDER, + KGEM_BSD, + KGEM_BLT, + } mode, ring; + + struct list flushing; + struct list large; + struct list large_inactive; + struct list active[NUM_CACHE_BUCKETS][3]; + struct list inactive[NUM_CACHE_BUCKETS]; + struct list snoop; + struct list batch_buffers, active_buffers; + + struct 
list requests[2]; + struct kgem_request *next_request; + uint32_t num_requests; + + struct { + struct list inactive[NUM_CACHE_BUCKETS]; + int16_t count; + } vma[NUM_MAP_TYPES]; + + uint32_t batch_flags; +#define I915_EXEC_SECURE (1<<9) + + uint16_t nbatch; + uint16_t surface; + uint16_t nexec; + uint16_t nreloc; + uint16_t nfence; + uint16_t batch_size; + uint16_t min_alignment; + + uint32_t flush:1; + uint32_t need_expire:1; + uint32_t need_purge:1; + uint32_t need_retire:1; + uint32_t need_throttle:1; + uint32_t busy:1; + + uint32_t has_userptr :1; + uint32_t has_blt :1; + uint32_t has_relaxed_fencing :1; + uint32_t has_relaxed_delta :1; + uint32_t has_semaphores :1; + uint32_t has_secure_batches :1; + uint32_t has_cacheing :1; + uint32_t has_llc :1; + + uint32_t can_blt_cpu :1; + + uint16_t fence_max; + uint16_t half_cpu_cache_pages; + uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable; + uint32_t aperture, aperture_fenced; + uint32_t max_upload_tile_size, max_copy_tile_size; + uint32_t max_gpu_size, max_cpu_size; + uint32_t large_object_size, max_object_size; + uint32_t buffer_size; + + void (*context_switch)(struct kgem *kgem, int new_mode); + void (*retire)(struct kgem *kgem); + void (*expire)(struct kgem *kgem); + + uint32_t batch[64*1024-8]; + struct drm_i915_gem_exec_object2 exec[256]; + struct drm_i915_gem_relocation_entry reloc[4096]; + +#ifdef DEBUG_MEMORY + struct { + int bo_allocs; + size_t bo_bytes; + } debug_memory; +#endif +}; + +#define KGEM_BATCH_RESERVED 1 +#define KGEM_RELOC_RESERVED 4 +#define KGEM_EXEC_RESERVED 1 + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof(a)/sizeof((a)[0])) +#endif + +#define KGEM_BATCH_SIZE(K) ((K)->batch_size-KGEM_BATCH_RESERVED) +#define KGEM_EXEC_SIZE(K) (int)(ARRAY_SIZE((K)->exec)-KGEM_EXEC_RESERVED) +#define KGEM_RELOC_SIZE(K) (int)(ARRAY_SIZE((K)->reloc)-KGEM_RELOC_RESERVED) + +void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen); +void kgem_reset(struct kgem *kgem); 
+ +struct kgem_bo *kgem_create_map(struct kgem *kgem, + void *ptr, uint32_t size, + bool read_only); + +struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name); +struct kgem_bo *kgem_create_for_prime(struct kgem *kgem, int name, uint32_t size); +int kgem_bo_export_to_prime(struct kgem *kgem, struct kgem_bo *bo); + +struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size, unsigned flags); +struct kgem_bo *kgem_create_proxy(struct kgem *kgem, + struct kgem_bo *target, + int offset, int length); + +struct kgem_bo *kgem_upload_source_image(struct kgem *kgem, + const void *data, + const BoxRec *box, + int stride, int bpp); +void kgem_proxy_bo_attach(struct kgem_bo *bo, struct kgem_bo **ptr); + +int kgem_choose_tiling(struct kgem *kgem, + int tiling, int width, int height, int bpp); +unsigned kgem_can_create_2d(struct kgem *kgem, int width, int height, int depth); +#define KGEM_CAN_CREATE_GPU 0x1 +#define KGEM_CAN_CREATE_CPU 0x2 +#define KGEM_CAN_CREATE_LARGE 0x4 +#define KGEM_CAN_CREATE_GTT 0x8 + +struct kgem_bo * +kgem_replace_bo(struct kgem *kgem, + struct kgem_bo *src, + uint32_t width, + uint32_t height, + uint32_t pitch, + uint32_t bpp); +enum { + CREATE_EXACT = 0x1, + CREATE_INACTIVE = 0x2, + CREATE_CPU_MAP = 0x4, + CREATE_GTT_MAP = 0x8, + CREATE_SCANOUT = 0x10, + CREATE_PRIME = 0x20, + CREATE_TEMPORARY = 0x40, + CREATE_NO_RETIRE = 0x80, + CREATE_NO_THROTTLE = 0x100, +}; +struct kgem_bo *kgem_create_2d(struct kgem *kgem, + int width, + int height, + int bpp, + int tiling, + uint32_t flags); +struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem, + int width, + int height, + int bpp, + uint32_t flags); + +uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format); +void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset); +int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo); + +void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo); +bool kgem_retire(struct kgem *kgem); +bool __kgem_is_idle(struct 
kgem *kgem); +static inline bool kgem_is_idle(struct kgem *kgem) +{ + if (kgem->num_requests == 0) { + DBG(("%s: no outstanding requests\n", __FUNCTION__)); + return true; + } + + return __kgem_is_idle(kgem); +} + +void _kgem_submit(struct kgem *kgem); +static inline void kgem_submit(struct kgem *kgem) +{ + if (kgem->nbatch) + _kgem_submit(kgem); +} + +static inline bool kgem_flush(struct kgem *kgem) +{ + return kgem->flush && kgem_is_idle(kgem); +} + +static inline void kgem_bo_submit(struct kgem *kgem, struct kgem_bo *bo) +{ + if (bo->exec) + _kgem_submit(kgem); +} + +bool __kgem_flush(struct kgem *kgem, struct kgem_bo *bo); +static inline void kgem_bo_flush(struct kgem *kgem, struct kgem_bo *bo) +{ + kgem_bo_submit(kgem, bo); + + if (!bo->needs_flush) + return; + + /* If the kernel fails to emit the flush, then it will be forced when + * we assume direct access. And as the useual failure is EIO, we do + * not actualy care. + */ + (void)__kgem_flush(kgem, bo); +} + +static inline struct kgem_bo *kgem_bo_reference(struct kgem_bo *bo) +{ + assert(bo->refcnt); + bo->refcnt++; + return bo; +} + +void _kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo); +static inline void kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) +{ + assert(bo->refcnt); + if (--bo->refcnt == 0) + _kgem_bo_destroy(kgem, bo); +} + +void kgem_clear_dirty(struct kgem *kgem); + +static inline void kgem_set_mode(struct kgem *kgem, enum kgem_mode mode) +{ + assert(!kgem->wedged); + +#if DEBUG_FLUSH_BATCH + kgem_submit(kgem); +#endif + + if (kgem->mode == mode) + return; + + kgem->context_switch(kgem, mode); + kgem->mode = mode; +} + +static inline void _kgem_set_mode(struct kgem *kgem, enum kgem_mode mode) +{ + assert(kgem->mode == KGEM_NONE); + assert(kgem->nbatch == 0); + kgem->context_switch(kgem, mode); + kgem->mode = mode; +} + +static inline bool kgem_check_batch(struct kgem *kgem, int num_dwords) +{ + assert(num_dwords > 0); + assert(kgem->nbatch < kgem->surface); + 
assert(kgem->surface <= kgem->batch_size); + return likely(kgem->nbatch + num_dwords + KGEM_BATCH_RESERVED <= kgem->surface); +} + +static inline bool kgem_check_reloc(struct kgem *kgem, int n) +{ + assert(kgem->nreloc <= KGEM_RELOC_SIZE(kgem)); + return likely(kgem->nreloc + n <= KGEM_RELOC_SIZE(kgem)); +} + +static inline bool kgem_check_exec(struct kgem *kgem, int n) +{ + assert(kgem->nexec <= KGEM_EXEC_SIZE(kgem)); + return likely(kgem->nexec + n <= KGEM_EXEC_SIZE(kgem)); +} + +static inline bool kgem_check_reloc_and_exec(struct kgem *kgem, int n) +{ + return kgem_check_reloc(kgem, n) && kgem_check_exec(kgem, n); +} + +static inline bool kgem_check_batch_with_surfaces(struct kgem *kgem, + int num_dwords, + int num_surfaces) +{ + return (int)(kgem->nbatch + num_dwords + KGEM_BATCH_RESERVED) <= (int)(kgem->surface - num_surfaces*8) && + kgem_check_reloc(kgem, num_surfaces) && + kgem_check_exec(kgem, num_surfaces); +} + +static inline uint32_t *kgem_get_batch(struct kgem *kgem, int num_dwords) +{ + if (!kgem_check_batch(kgem, num_dwords)) { + unsigned mode = kgem->mode; + _kgem_submit(kgem); + _kgem_set_mode(kgem, mode); + } + + return kgem->batch + kgem->nbatch; +} + +static inline void kgem_advance_batch(struct kgem *kgem, int num_dwords) +{ + kgem->nbatch += num_dwords; +} + +bool kgem_check_bo(struct kgem *kgem, ...) __attribute__((sentinel(0))); +bool kgem_check_bo_fenced(struct kgem *kgem, struct kgem_bo *bo); +bool kgem_check_many_bo_fenced(struct kgem *kgem, ...) 
__attribute__((sentinel(0))); + +void _kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo); +static inline void kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo) +{ + if (bo->proxy) + bo = bo->proxy; + + if (bo->exec == NULL) + _kgem_add_bo(kgem, bo); +} + +#define KGEM_RELOC_FENCED 0x8000 +uint32_t kgem_add_reloc(struct kgem *kgem, + uint32_t pos, + struct kgem_bo *bo, + uint32_t read_write_domains, + uint32_t delta); + +void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo); +void *kgem_bo_map__async(struct kgem *kgem, struct kgem_bo *bo); +void *kgem_bo_map__gtt(struct kgem *kgem, struct kgem_bo *bo); +void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo); +void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo); +void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo); +void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo); +void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo); +void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr); +uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo); + +bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo, + const void *data, int length); + +int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo); +void kgem_get_tile_size(struct kgem *kgem, int tiling, + int *tile_width, int *tile_height, int *tile_size); + +static inline int __kgem_buffer_size(struct kgem_bo *bo) +{ + assert(bo->proxy != NULL); + return bo->size.bytes; +} + +static inline int __kgem_bo_size(struct kgem_bo *bo) +{ + assert(bo->proxy == NULL); + return PAGE_SIZE * bo->size.pages.count; +} + +static inline int kgem_bo_size(struct kgem_bo *bo) +{ + if (bo->proxy) + return __kgem_buffer_size(bo); + else + return __kgem_bo_size(bo); +} + +static inline bool kgem_bo_blt_pitch_is_ok(struct kgem *kgem, + struct kgem_bo *bo) +{ + int pitch = bo->pitch; + if (kgem->gen >= 40 && bo->tiling) + pitch /= 4; + if (pitch > G_MAXSHORT) { + DBG(("%s: can not blt to handle=%d, adjusted 
pitch=%d\n", + __FUNCTION__, bo->handle, pitch)); + return false; + } + + return true; +} + +static inline bool kgem_bo_can_blt(struct kgem *kgem, + struct kgem_bo *bo) +{ + if (bo->tiling == I915_TILING_Y) { + DBG(("%s: can not blt to handle=%d, tiling=Y\n", + __FUNCTION__, bo->handle)); + return false; + } + + return kgem_bo_blt_pitch_is_ok(kgem, bo); +} + +static inline bool kgem_bo_is_mappable(struct kgem *kgem, + struct kgem_bo *bo) +{ + DBG(("%s: domain=%d, offset: %d size: %d\n", + __FUNCTION__, bo->domain, bo->presumed_offset, kgem_bo_size(bo))); + + if (bo->domain == DOMAIN_GTT) + return true; + + if (kgem->gen < 40 && bo->tiling && + bo->presumed_offset & (kgem_bo_fenced_size(kgem, bo) - 1)) + return false; + + if (!bo->presumed_offset) + return kgem_bo_size(bo) <= kgem->aperture_mappable / 4; + + return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable; +} + +static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo) +{ + DBG(("%s: map=%p, tiling=%d, domain=%d\n", + __FUNCTION__, bo->map, bo->tiling, bo->domain)); + + if (bo->map == NULL) + return bo->tiling == I915_TILING_NONE && bo->domain == DOMAIN_CPU; + + return IS_CPU_MAP(bo->map) == !bo->tiling; +} + +static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo) +{ + if (kgem_bo_mapped(kgem, bo)) + return true; + + if (!bo->tiling && kgem->has_llc) + return true; + + if (kgem->gen == 21 && bo->tiling == I915_TILING_Y) + return false; + + return kgem_bo_size(bo) <= kgem->aperture_mappable / 4; +} + +static inline bool kgem_bo_is_snoop(struct kgem_bo *bo) +{ + while (bo->proxy) + bo = bo->proxy; + return bo->snoop; +} + +static inline bool kgem_bo_is_busy(struct kgem_bo *bo) +{ + DBG(("%s: handle=%d, domain: %d exec? %d, rq? %d\n", __FUNCTION__, + bo->handle, bo->domain, bo->exec != NULL, bo->rq != NULL)); + return bo->rq; +} + +static inline bool __kgem_bo_is_busy(struct kgem *kgem, struct kgem_bo *bo) +{ + DBG(("%s: handle=%d, domain: %d exec? %d, rq? 
%d\n", __FUNCTION__, + bo->handle, bo->domain, bo->exec != NULL, bo->rq != NULL)); + if (kgem_flush(kgem)) + kgem_submit(kgem); + if (bo->rq && !bo->exec) + kgem_retire(kgem); + return kgem_bo_is_busy(bo); +} + +static inline bool kgem_bo_is_dirty(struct kgem_bo *bo) +{ + if (bo == NULL) + return false; + + return bo->dirty; +} + +static inline void kgem_bo_mark_dirty(struct kgem_bo *bo) +{ + do { + if (bo->dirty) + return; + + DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle)); + assert(bo->exec); + assert(bo->rq); + + bo->needs_flush = bo->dirty = true; + list_move(&bo->request, &bo->rq->buffers); + } while ((bo = bo->proxy)); +} + +#define KGEM_BUFFER_WRITE 0x1 +#define KGEM_BUFFER_INPLACE 0x2 +#define KGEM_BUFFER_LAST 0x4 + +#define KGEM_BUFFER_WRITE_INPLACE (KGEM_BUFFER_WRITE | KGEM_BUFFER_INPLACE) + +struct kgem_bo *kgem_create_buffer(struct kgem *kgem, + uint32_t size, uint32_t flags, + void **ret); +struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem, + int width, int height, int bpp, + uint32_t flags, + void **ret); +bool kgem_buffer_is_inplace(struct kgem_bo *bo); +void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *bo); + +void kgem_throttle(struct kgem *kgem); +#define MAX_INACTIVE_TIME 10 +bool kgem_expire_cache(struct kgem *kgem); +void kgem_purge_cache(struct kgem *kgem); +void kgem_cleanup_cache(struct kgem *kgem); + +#if HAS_EXTRA_DEBUG +void __kgem_batch_debug(struct kgem *kgem, uint32_t nbatch); +#else +static inline void __kgem_batch_debug(struct kgem *kgem, uint32_t nbatch) +{ + (void)kgem; + (void)nbatch; +} +#endif + +#endif /* KGEM_H */ diff --git a/cogl/driver/drm/kgem_debug.c b/cogl/driver/drm/kgem_debug.c new file mode 100644 index 00000000..2dc1b456 --- /dev/null +++ b/cogl/driver/drm/kgem_debug.c @@ -0,0 +1,424 @@ +/* + * Copyright © 2007-2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * 
to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include "sna.h" +#include "sna_reg.h" + +#include "kgem_debug.h" + +struct drm_i915_gem_relocation_entry * +kgem_debug_get_reloc_entry(struct kgem *kgem, uint32_t offset) +{ + int i; + + offset *= sizeof(uint32_t); + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == offset) + return kgem->reloc+i; + + assert(!"valid relocation entry, unknown batch offset"); + return NULL; +} + +struct kgem_bo * +kgem_debug_get_bo_for_reloc_entry(struct kgem *kgem, + struct drm_i915_gem_relocation_entry *reloc) +{ + struct kgem_bo *bo; + + if (reloc == NULL) + return NULL; + + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == reloc->target_handle && bo->proxy == NULL) + break; + + assert(&bo->request != &kgem->next_request->buffers); + + return bo; +} + +static int kgem_debug_handle_is_fenced(struct kgem *kgem, uint32_t handle) +{ + int i; + + for (i = 0; i < 
kgem->nexec; i++) + if (kgem->exec[i].handle == handle) + return kgem->exec[i].flags & EXEC_OBJECT_NEEDS_FENCE; + + return 0; +} + +static int kgem_debug_handle_tiling(struct kgem *kgem, uint32_t handle) +{ + struct kgem_bo *bo; + + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == handle) + return bo->tiling; + + return 0; +} + +void +kgem_debug_print(const uint32_t *data, + uint32_t offset, unsigned int index, + char *fmt, ...) +{ + va_list va; + char buf[240]; + int len; + + len = snprintf(buf, sizeof(buf), + "0x%08x: 0x%08x: %s", + (offset + index) * 4, + data[index], + index == 0 ? "" : " "); + + va_start(va, fmt); + vsnprintf(buf + len, sizeof(buf) - len, fmt, va); + va_end(va); + + ErrorF("%s", buf); +} + +static int +decode_nop(struct kgem *kgem, uint32_t offset) +{ + uint32_t *data = kgem->batch + offset; + kgem_debug_print(data, offset, 0, "UNKNOWN\n"); + assert(0); + return 1; +} + +static int +decode_mi(struct kgem *kgem, uint32_t offset) +{ + static const struct { + uint32_t opcode; + int len_mask; + int min_len; + int max_len; + const char *name; + } opcodes[] = { + { 0x08, 0, 1, 1, "MI_ARB_ON_OFF" }, + { 0x0a, 0, 1, 1, "MI_BATCH_BUFFER_END" }, + { 0x30, 0x3f, 3, 3, "MI_BATCH_BUFFER" }, + { 0x31, 0x3f, 2, 2, "MI_BATCH_BUFFER_START" }, + { 0x14, 0x3f, 3, 3, "MI_DISPLAY_BUFFER_INFO" }, + { 0x04, 0, 1, 1, "MI_FLUSH" }, + { 0x22, 0x1f, 3, 3, "MI_LOAD_REGISTER_IMM" }, + { 0x13, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_EXCL" }, + { 0x12, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_INCL" }, + { 0x00, 0, 1, 1, "MI_NOOP" }, + { 0x11, 0x3f, 2, 2, "MI_OVERLAY_FLIP" }, + { 0x07, 0, 1, 1, "MI_REPORT_HEAD" }, + { 0x18, 0x3f, 2, 2, "MI_SET_CONTEXT" }, + { 0x20, 0x3f, 3, 4, "MI_STORE_DATA_IMM" }, + { 0x21, 0x3f, 3, 4, "MI_STORE_DATA_INDEX" }, + { 0x24, 0x3f, 3, 3, "MI_STORE_REGISTER_MEM" }, + { 0x02, 0, 1, 1, "MI_USER_INTERRUPT" }, + { 0x03, 0, 1, 1, "MI_WAIT_FOR_EVENT" }, + { 0x16, 0x7f, 3, 3, "MI_SEMAPHORE_MBOX" }, + { 0x26, 0x1f, 3, 4, "MI_FLUSH_DW" }, 
+ { 0x0b, 0, 1, 1, "MI_SUSPEND_FLUSH" }, + }; + uint32_t *data = kgem->batch + offset; + int op; + + for (op = 0; op < ARRAY_SIZE(opcodes); op++) { + if ((data[0] & 0x1f800000) >> 23 == opcodes[op].opcode) { + unsigned int len = 1, i; + + kgem_debug_print(data, offset, 0, "%s\n", opcodes[op].name); + if (opcodes[op].max_len > 1) { + len = (data[0] & opcodes[op].len_mask) + 2; + if (len < opcodes[op].min_len || + len > opcodes[op].max_len) + { + ErrorF("Bad length (%d) in %s, [%d, %d]\n", + len, opcodes[op].name, + opcodes[op].min_len, + opcodes[op].max_len); + assert(0); + } + } + + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + + return len; + } + } + + kgem_debug_print(data, offset, 0, "MI UNKNOWN\n"); + assert(0); + return 1; +} + +static int +decode_2d(struct kgem *kgem, uint32_t offset) +{ + static const struct { + uint32_t opcode; + int min_len; + int max_len; + const char *name; + } opcodes[] = { + { 0x40, 5, 5, "COLOR_BLT" }, + { 0x43, 6, 6, "SRC_COPY_BLT" }, + { 0x01, 8, 8, "XY_SETUP_BLT" }, + { 0x11, 9, 9, "XY_SETUP_MONO_PATTERN_SL_BLT" }, + { 0x03, 3, 3, "XY_SETUP_CLIP_BLT" }, + { 0x24, 2, 2, "XY_PIXEL_BLT" }, + { 0x25, 3, 3, "XY_SCANLINES_BLT" }, + { 0x26, 4, 4, "Y_TEXT_BLT" }, + { 0x31, 5, 134, "XY_TEXT_IMMEDIATE_BLT" }, + { 0x50, 6, 6, "XY_COLOR_BLT" }, + { 0x51, 6, 6, "XY_PAT_BLT" }, + { 0x76, 8, 8, "XY_PAT_CHROMA_BLT" }, + { 0x72, 7, 135, "XY_PAT_BLT_IMMEDIATE" }, + { 0x77, 9, 137, "XY_PAT_CHROMA_BLT_IMMEDIATE" }, + { 0x52, 9, 9, "XY_MONO_PAT_BLT" }, + { 0x59, 7, 7, "XY_MONO_PAT_FIXED_BLT" }, + { 0x53, 8, 8, "XY_SRC_COPY_BLT" }, + { 0x54, 8, 8, "XY_MONO_SRC_COPY_BLT" }, + { 0x71, 9, 137, "XY_MONO_SRC_COPY_IMMEDIATE_BLT" }, + { 0x55, 9, 9, "XY_FULL_BLT" }, + { 0x55, 9, 137, "XY_FULL_IMMEDIATE_PATTERN_BLT" }, + { 0x56, 9, 9, "XY_FULL_MONO_SRC_BLT" }, + { 0x75, 10, 138, "XY_FULL_MONO_SRC_IMMEDIATE_PATTERN_BLT" }, + { 0x57, 12, 12, "XY_FULL_MONO_PATTERN_BLT" }, + { 0x58, 12, 12, "XY_FULL_MONO_PATTERN_MONO_SRC_BLT" }, + 
}; + + unsigned int op, len; + const char *format = NULL; + uint32_t *data = kgem->batch + offset; + struct drm_i915_gem_relocation_entry *reloc; + + /* Special case the two most common ops that we detail in full */ + switch ((data[0] & 0x1fc00000) >> 22) { + case 0x50: + kgem_debug_print(data, offset, 0, + "XY_COLOR_BLT (rgb %sabled, alpha %sabled, dst tile %d)\n", + (data[0] & (1 << 20)) ? "en" : "dis", + (data[0] & (1 << 21)) ? "en" : "dis", + (data[0] >> 11) & 1); + + len = (data[0] & 0x000000ff) + 2; + assert(len == 6); + + switch ((data[1] >> 24) & 0x3) { + case 0: + format="8"; + break; + case 1: + format="565"; + break; + case 2: + format="1555"; + break; + case 3: + format="8888"; + break; + } + + kgem_debug_print(data, offset, 1, "format %s, rop %x, pitch %d, " + "clipping %sabled\n", format, + (data[1] >> 16) & 0xff, + (short)(data[1] & 0xffff), + data[1] & (1 << 30) ? "en" : "dis"); + kgem_debug_print(data, offset, 2, "(%d,%d)\n", + data[2] & 0xffff, data[2] >> 16); + kgem_debug_print(data, offset, 3, "(%d,%d)\n", + data[3] & 0xffff, data[3] >> 16); + reloc = kgem_debug_get_reloc_entry(kgem, offset+4); + kgem_debug_print(data, offset, 4, "dst offset 0x%08x [handle=%d, delta=%d, read=%x, write=%x (fenced? %d, tiling? %d)]\n", + data[4], + reloc->target_handle, reloc->delta, + reloc->read_domains, reloc->write_domain, + kgem_debug_handle_is_fenced(kgem, reloc->target_handle), + kgem_debug_handle_tiling(kgem, reloc->target_handle)); + kgem_debug_print(data, offset, 5, "color\n"); + assert(kgem->gen >= 40 || + kgem_debug_handle_is_fenced(kgem, reloc->target_handle)); + return len; + + case 0x53: + kgem_debug_print(data, offset, 0, + "XY_SRC_COPY_BLT (rgb %sabled, alpha %sabled, " + "src tile %d, dst tile %d)\n", + (data[0] & (1 << 20)) ? "en" : "dis", + (data[0] & (1 << 21)) ? 
"en" : "dis", + (data[0] >> 15) & 1, + (data[0] >> 11) & 1); + + len = (data[0] & 0x000000ff) + 2; + assert(len == 8); + + switch ((data[1] >> 24) & 0x3) { + case 0: + format="8"; + break; + case 1: + format="565"; + break; + case 2: + format="1555"; + break; + case 3: + format="8888"; + break; + } + + kgem_debug_print(data, offset, 1, "format %s, rop %x, dst pitch %d, " + "clipping %sabled\n", format, + (data[1] >> 16) & 0xff, + (short)(data[1] & 0xffff), + data[1] & (1 << 30) ? "en" : "dis"); + kgem_debug_print(data, offset, 2, "dst (%d,%d)\n", + data[2] & 0xffff, data[2] >> 16); + kgem_debug_print(data, offset, 3, "dst (%d,%d)\n", + data[3] & 0xffff, data[3] >> 16); + reloc = kgem_debug_get_reloc_entry(kgem, offset+4); + assert(reloc); + kgem_debug_print(data, offset, 4, "dst offset 0x%08x [handle=%d, delta=%d, read=%x, write=%x, (fenced? %d, tiling? %d)]\n", + data[4], + reloc->target_handle, reloc->delta, + reloc->read_domains, reloc->write_domain, + kgem_debug_handle_is_fenced(kgem, reloc->target_handle), + kgem_debug_handle_tiling(kgem, reloc->target_handle)); + assert(kgem->gen >= 40 || + kgem_debug_handle_is_fenced(kgem, reloc->target_handle)); + + kgem_debug_print(data, offset, 5, "src (%d,%d)\n", + data[5] & 0xffff, data[5] >> 16); + kgem_debug_print(data, offset, 6, "src pitch %d\n", + (short)(data[6] & 0xffff)); + reloc = kgem_debug_get_reloc_entry(kgem, offset+7); + assert(reloc); + kgem_debug_print(data, offset, 7, "src offset 0x%08x [handle=%d, delta=%d, read=%x, write=%x (fenced? %d, tiling? 
%d)]\n", + data[7], + reloc->target_handle, reloc->delta, + reloc->read_domains, reloc->write_domain, + kgem_debug_handle_is_fenced(kgem, reloc->target_handle), + kgem_debug_handle_tiling(kgem, reloc->target_handle)); + assert(kgem->gen >= 40 || + kgem_debug_handle_is_fenced(kgem, reloc->target_handle)); + + return len; + } + + for (op = 0; op < ARRAY_SIZE(opcodes); op++) { + if ((data[0] & 0x1fc00000) >> 22 == opcodes[op].opcode) { + unsigned int i; + + len = 1; + kgem_debug_print(data, offset, 0, "%s\n", opcodes[op].name); + if (opcodes[op].max_len > 1) { + len = (data[0] & 0x000000ff) + 2; + assert(len >= opcodes[op].min_len && + len <= opcodes[op].max_len); + } + + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + + return len; + } + } + + kgem_debug_print(data, offset, 0, "2D UNKNOWN\n"); + assert(0); + return 1; +} + +static int (*decode_3d(int gen))(struct kgem*, uint32_t) +{ + if (gen >= 80) { + } else if (gen >= 70) { + return kgem_gen7_decode_3d; + } else if (gen >= 60) { + return kgem_gen6_decode_3d; + } else if (gen >= 50) { + return kgem_gen5_decode_3d; + } else if (gen >= 40) { + return kgem_gen4_decode_3d; + } else if (gen >= 30) { + return kgem_gen3_decode_3d; + } else if (gen >= 20) { + return kgem_gen2_decode_3d; + } + assert(0); +} + +static void (*finish_state(int gen))(struct kgem*) +{ + if (gen >= 80) { + } else if (gen >= 70) { + return kgem_gen7_finish_state; + } else if (gen >= 60) { + return kgem_gen6_finish_state; + } else if (gen >= 50) { + return kgem_gen5_finish_state; + } else if (gen >= 40) { + return kgem_gen4_finish_state; + } else if (gen >= 30) { + return kgem_gen3_finish_state; + } else if (gen >= 20) { + return kgem_gen2_finish_state; + } + assert(0); +} + +void __kgem_batch_debug(struct kgem *kgem, uint32_t nbatch) +{ + int (*const decode[])(struct kgem *, uint32_t) = { + decode_mi, + decode_nop, + decode_2d, + decode_3d(kgem->gen), + }; + uint32_t offset = 0; + + while (offset < nbatch) { + int 
class = (kgem->batch[offset] & 0xe0000000) >> 29; + assert(class < ARRAY_SIZE(decode)); + offset += decode[class](kgem, offset); + } + + finish_state(kgem->gen)(kgem); +} diff --git a/cogl/driver/drm/kgem_debug.h b/cogl/driver/drm/kgem_debug.h new file mode 100644 index 00000000..82d6f666 --- /dev/null +++ b/cogl/driver/drm/kgem_debug.h @@ -0,0 +1,34 @@ +#ifndef KGEM_DEBUG_H +#define KGEM_DEBUG_H + +void +kgem_debug_print(const uint32_t *data, + uint32_t offset, unsigned int index, + char *fmt, ...); + +struct drm_i915_gem_relocation_entry * +kgem_debug_get_reloc_entry(struct kgem *kgem, uint32_t offset); + +struct kgem_bo * +kgem_debug_get_bo_for_reloc_entry(struct kgem *kgem, + struct drm_i915_gem_relocation_entry *reloc); + +int kgem_gen7_decode_3d(struct kgem *kgem, uint32_t offset); +void kgem_gen7_finish_state(struct kgem *kgem); + +int kgem_gen6_decode_3d(struct kgem *kgem, uint32_t offset); +void kgem_gen6_finish_state(struct kgem *kgem); + +int kgem_gen5_decode_3d(struct kgem *kgem, uint32_t offset); +void kgem_gen5_finish_state(struct kgem *kgem); + +int kgem_gen4_decode_3d(struct kgem *kgem, uint32_t offset); +void kgem_gen4_finish_state(struct kgem *kgem); + +int kgem_gen3_decode_3d(struct kgem *kgem, uint32_t offset); +void kgem_gen3_finish_state(struct kgem *kgem); + +int kgem_gen2_decode_3d(struct kgem *kgem, uint32_t offset); +void kgem_gen2_finish_state(struct kgem *kgem); + +#endif diff --git a/cogl/driver/drm/kgem_debug_gen2.c b/cogl/driver/drm/kgem_debug_gen2.c new file mode 100644 index 00000000..09f3873b --- /dev/null +++ b/cogl/driver/drm/kgem_debug_gen2.c @@ -0,0 +1,687 @@ +/* + * Copyright © 2007-2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or 
sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Eric Anholt + * Chris Wilson + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include "sna.h" +#include "sna_reg.h" + +#include "gen2_render.h" + +#include "kgem_debug.h" + +static struct state { + int vertex_format; +} state; + +static inline float int_as_float(uint32_t dw) +{ + union { + float f; + uint32_t dw; + } u; + u.dw = dw; + return u.f; +} + +static int +decode_3d_primitive(struct kgem *kgem, uint32_t offset) +{ + uint32_t *data = kgem->batch + offset; + char immediate = (data[0] & (1 << 23)) == 0; + unsigned int len; + const char *primtype; + + switch ((data[0] >> 18) & 0xf) { + case 0x0: primtype = "TRILIST"; break; + case 0x1: primtype = "TRISTRIP"; break; + case 0x2: primtype = "TRISTRIP_REVERSE"; break; + case 0x3: primtype = "TRIFAN"; break; + case 0x4: primtype = "POLYGON"; break; + case 0x5: primtype = "LINELIST"; break; + case 0x6: primtype = "LINESTRIP"; break; + case 0x7: primtype = "RECTLIST"; break; + case 0x8: primtype = "POINTLIST"; break; + case 0x9: primtype = "DIB"; break; + case 0xa: primtype = "CLEAR_RECT"; break; + default: primtype = "unknown"; break; + } + + /* XXX: 3DPRIM_DIB not 
supported */ + if (immediate) { + len = (data[0] & 0x0003ffff) + 2; + kgem_debug_print(data, offset, 0, "3DPRIMITIVE inline %s\n", primtype); +#if 0 + if (!saved_s2_set || !saved_s4_set) { + fprintf(out, "unknown vertex format\n"); + for (i = 1; i < len; i++) { + kgem_debug_print(data, offset, i, + " vertex data (%f float)\n", + int_as_float(data[i])); + } + } else { + unsigned int vertex = 0; + for (i = 1; i < len;) { + unsigned int tc; + +#define VERTEX_OUT(fmt, ...) do { \ + if (i < len) \ + kgem_debug_print(data, offset, i, " V%d."fmt"\n", vertex, __VA_ARGS__); \ + else \ + fprintf(out, " missing data in V%d\n", vertex); \ + i++; \ +} while (0) + + VERTEX_OUT("X = %f", int_as_float(data[i])); + VERTEX_OUT("Y = %f", int_as_float(data[i])); + switch (saved_s4 >> 6 & 0x7) { + case 0x1: + VERTEX_OUT("Z = %f", int_as_float(data[i])); + break; + case 0x2: + VERTEX_OUT("Z = %f", int_as_float(data[i])); + VERTEX_OUT("W = %f", int_as_float(data[i])); + break; + case 0x3: + break; + case 0x4: + VERTEX_OUT("W = %f", int_as_float(data[i])); + break; + default: + fprintf(out, "bad S4 position mask\n"); + } + + if (saved_s4 & (1 << 10)) { + VERTEX_OUT("color = (A=0x%02x, R=0x%02x, G=0x%02x, " + "B=0x%02x)", + data[i] >> 24, + (data[i] >> 16) & 0xff, + (data[i] >> 8) & 0xff, + data[i] & 0xff); + } + if (saved_s4 & (1 << 11)) { + VERTEX_OUT("spec = (A=0x%02x, R=0x%02x, G=0x%02x, " + "B=0x%02x)", + data[i] >> 24, + (data[i] >> 16) & 0xff, + (data[i] >> 8) & 0xff, + data[i] & 0xff); + } + if (saved_s4 & (1 << 12)) + VERTEX_OUT("width = 0x%08x)", data[i]); + + for (tc = 0; tc <= 7; tc++) { + switch ((saved_s2 >> (tc * 4)) & 0xf) { + case 0x0: + VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i])); + VERTEX_OUT("T%d.Y = %f", tc, int_as_float(data[i])); + break; + case 0x1: + VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i])); + VERTEX_OUT("T%d.Y = %f", tc, int_as_float(data[i])); + VERTEX_OUT("T%d.Z = %f", tc, int_as_float(data[i])); + break; + case 0x2: + VERTEX_OUT("T%d.X = %f", 
tc, int_as_float(data[i])); + VERTEX_OUT("T%d.Y = %f", tc, int_as_float(data[i])); + VERTEX_OUT("T%d.Z = %f", tc, int_as_float(data[i])); + VERTEX_OUT("T%d.W = %f", tc, int_as_float(data[i])); + break; + case 0x3: + VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i])); + break; + case 0x4: + VERTEX_OUT("T%d.XY = 0x%08x half-float", tc, data[i]); + break; + case 0x5: + VERTEX_OUT("T%d.XY = 0x%08x half-float", tc, data[i]); + VERTEX_OUT("T%d.ZW = 0x%08x half-float", tc, data[i]); + break; + case 0xf: + break; + default: + fprintf(out, "bad S2.T%d format\n", tc); + } + } + vertex++; + } + } +#endif + } else { + /* indirect vertices */ + len = data[0] & 0x0000ffff; /* index count */ +#if 0 + if (data[0] & (1 << 17)) { + /* random vertex access */ + kgem_debug_print(data, offset, 0, + "3DPRIMITIVE random indirect %s (%d)\n", primtype, len); + if (len == 0) { + /* vertex indices continue until 0xffff is found */ + for (i = 1; i < count; i++) { + if ((data[i] & 0xffff) == 0xffff) { + kgem_debug_print(data, offset, i, + " indices: (terminator)\n"); + ret = i; + goto out; + } else if ((data[i] >> 16) == 0xffff) { + kgem_debug_print(data, offset, i, + " indices: 0x%04x, (terminator)\n", + data[i] & 0xffff); + ret = i; + goto out; + } else { + kgem_debug_print(data, offset, i, + " indices: 0x%04x, 0x%04x\n", + data[i] & 0xffff, data[i] >> 16); + } + } + fprintf(out, + "3DPRIMITIVE: no terminator found in index buffer\n"); + ret = count; + goto out; + } else { + /* fixed size vertex index buffer */ + for (j = 1, i = 0; i < len; i += 2, j++) { + if (i * 2 == len - 1) { + kgem_debug_print(data, offset, j, + " indices: 0x%04x\n", + data[j] & 0xffff); + } else { + kgem_debug_print(data, offset, j, + " indices: 0x%04x, 0x%04x\n", + data[j] & 0xffff, data[j] >> 16); + } + } + } + ret = (len + 1) / 2 + 1; + goto out; + } else { + /* sequential vertex access */ + kgem_debug_print(data, offset, 0, + "3DPRIMITIVE sequential indirect %s, %d starting from " + "%d\n", primtype, len, data[1] 
& 0xffff); + kgem_debug_print(data, offset, 1, " start\n"); + ret = 2; + goto out; + } +#endif + } + + return len; +} + +static int +decode_3d_1d(struct kgem *kgem, uint32_t offset) +{ + uint32_t *data = kgem->batch + offset; + unsigned int len, i, idx, word, map; + const char *format, *zformat, *type; + uint32_t opcode; + + static const struct { + uint32_t opcode; + int min_len; + int max_len; + const char *name; + } opcodes_3d_1d[] = { + { 0x86, 4, 4, "3DSTATE_CHROMA_KEY" }, + { 0x88, 2, 2, "3DSTATE_CONSTANT_BLEND_COLOR" }, + { 0x99, 2, 2, "3DSTATE_DEFAULT_DIFFUSE" }, + { 0x9a, 2, 2, "3DSTATE_DEFAULT_SPECULAR" }, + { 0x98, 2, 2, "3DSTATE_DEFAULT_Z" }, + { 0x97, 2, 2, "3DSTATE_DEPTH_OFFSET_SCALE" }, + { 0x9d, 65, 65, "3DSTATE_FILTER_COEFFICIENTS_4X4" }, + { 0x9e, 4, 4, "3DSTATE_MONO_FILTER" }, + { 0x89, 4, 4, "3DSTATE_FOG_MODE" }, + { 0x8f, 2, 16, "3DSTATE_MAP_PALLETE_LOAD_32" }, + { 0x83, 2, 2, "3DSTATE_SPAN_STIPPLE" }, + { 0x8c, 2, 2, "3DSTATE_MAP_COORD_TRANSFORM" }, + { 0x8b, 2, 2, "3DSTATE_MAP_VERTEX_TRANSFORM" }, + { 0x8d, 3, 3, "3DSTATE_W_STATE" }, + { 0x01, 2, 2, "3DSTATE_COLOR_FACTOR" }, + { 0x02, 2, 2, "3DSTATE_MAP_COORD_SETBIND" }, + }, *opcode_3d_1d; + + opcode = (data[0] & 0x00ff0000) >> 16; + + switch (opcode) { + case 0x07: + /* This instruction is unusual. A 0 length means just 1 DWORD instead of + * 2. The 0 length is specified in one place to be unsupported, but + * stated to be required in another, and 0 length LOAD_INDIRECTs appear + * to cause no harm at least. 
+ */ + kgem_debug_print(data, offset, 0, "3DSTATE_LOAD_INDIRECT\n"); + len = (data[0] & 0x000000ff) + 1; + i = 1; + if (data[0] & (0x01 << 8)) { + kgem_debug_print(data, offset, i++, "SIS.0\n"); + kgem_debug_print(data, offset, i++, "SIS.1\n"); + } + if (data[0] & (0x02 << 8)) { + kgem_debug_print(data, offset, i++, "DIS.0\n"); + } + if (data[0] & (0x04 << 8)) { + kgem_debug_print(data, offset, i++, "SSB.0\n"); + kgem_debug_print(data, offset, i++, "SSB.1\n"); + } + if (data[0] & (0x08 << 8)) { + kgem_debug_print(data, offset, i++, "MSB.0\n"); + kgem_debug_print(data, offset, i++, "MSB.1\n"); + } + if (data[0] & (0x10 << 8)) { + kgem_debug_print(data, offset, i++, "PSP.0\n"); + kgem_debug_print(data, offset, i++, "PSP.1\n"); + } + if (data[0] & (0x20 << 8)) { + kgem_debug_print(data, offset, i++, "PSC.0\n"); + kgem_debug_print(data, offset, i++, "PSC.1\n"); + } + assert(len == i); + return len; + case 0x04: + kgem_debug_print(data, offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_1\n"); + len = (data[0] & 0x0000000f) + 2; + i = 1; + for (word = 0; word <= 8; word++) { + if (data[0] & (1 << (4 + word))) { + kgem_debug_print(data, offset, i, "S%d: 0x%08x\n", i, data[i]); + i++; + } + } + assert (len ==i); + return len; + case 0x03: + kgem_debug_print(data, offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_2\n"); + len = (data[0] & 0x0000000f) + 2; + i = 1; + for (word = 6; word <= 14; word++) { + if (data[0] & (1 << word)) { + if (word == 6) + kgem_debug_print(data, offset, i++, "TBCF\n"); + else if (word >= 7 && word <= 10) { + kgem_debug_print(data, offset, i++, "TB%dC\n", word - 7); + kgem_debug_print(data, offset, i++, "TB%dA\n", word - 7); + } else if (word >= 11 && word <= 14) { + kgem_debug_print(data, offset, i, "TM%dS0: offset=0x%08x, %s\n", + word - 11, + data[i]&0xfffffffe, + data[i]&1?"use fence":""); + i++; + kgem_debug_print(data, offset, i, "TM%dS1: height=%i, width=%i, %s\n", + word - 11, + data[i]>>21, (data[i]>>10)&0x3ff, + 
data[i]&2?(data[i]&1?"y-tiled":"x-tiled"):""); + i++; + kgem_debug_print(data, offset, i, "TM%dS2: pitch=%i, \n", + word - 11, + ((data[i]>>21) + 1)*4); + i++; + kgem_debug_print(data, offset, i++, "TM%dS3\n", word - 11); + kgem_debug_print(data, offset, i++, "TM%dS4: dflt color\n", word - 11); + } + } + } + assert (len == i); + return len; + case 0x00: + kgem_debug_print(data, offset, 0, "3DSTATE_MAP_STATE\n"); + len = (data[0] & 0x0000003f) + 2; + kgem_debug_print(data, offset, 1, "mask\n"); + + i = 2; + for (map = 0; map <= 15; map++) { + if (data[1] & (1 << map)) { + int width, height, pitch, dword; + const char *tiling; + + dword = data[i]; + kgem_debug_print(data, offset, i++, "map %d MS2 %s%s%s\n", map, + dword&(1<<31)?"untrusted surface, ":"", + dword&(1<<1)?"vertical line stride enable, ":"", + dword&(1<<0)?"vertical ofs enable, ":""); + + dword = data[i]; + width = ((dword >> 10) & ((1 << 11) - 1))+1; + height = ((dword >> 21) & ((1 << 11) - 1))+1; + + tiling = "none"; + if (dword & (1 << 2)) + tiling = "fenced"; + else if (dword & (1 << 1)) + tiling = dword & (1 << 0) ? 
"Y" : "X"; + type = " BAD"; + format = "BAD"; + switch ((dword>>7) & 0x7) { + case 1: + type = "8b"; + switch ((dword>>3) & 0xf) { + case 0: format = "I"; break; + case 1: format = "L"; break; + case 2: format = "A"; break; + case 3: format = " mono"; break; } + break; + case 2: + type = "16b"; + switch ((dword>>3) & 0xf) { + case 0: format = " rgb565"; break; + case 1: format = " argb1555"; break; + case 2: format = " argb4444"; break; + case 5: format = " ay88"; break; + case 6: format = " bump655"; break; + case 7: format = "I"; break; + case 8: format = "L"; break; + case 9: format = "A"; break; } + break; + case 3: + type = "32b"; + switch ((dword>>3) & 0xf) { + case 0: format = " argb8888"; break; + case 1: format = " abgr8888"; break; + case 2: format = " xrgb8888"; break; + case 3: format = " xbgr8888"; break; + case 4: format = " qwvu8888"; break; + case 5: format = " axvu8888"; break; + case 6: format = " lxvu8888"; break; + case 7: format = " xlvu8888"; break; + case 8: format = " argb2101010"; break; + case 9: format = " abgr2101010"; break; + case 10: format = " awvu2101010"; break; + case 11: format = " gr1616"; break; + case 12: format = " vu1616"; break; + case 13: format = " xI824"; break; + case 14: format = " xA824"; break; + case 15: format = " xL824"; break; } + break; + case 5: + type = "422"; + switch ((dword>>3) & 0xf) { + case 0: format = " yuv_swapy"; break; + case 1: format = " yuv"; break; + case 2: format = " yuv_swapuv"; break; + case 3: format = " yuv_swapuvy"; break; } + break; + case 6: + type = "compressed"; + switch ((dword>>3) & 0x7) { + case 0: format = " dxt1"; break; + case 1: format = " dxt2_3"; break; + case 2: format = " dxt4_5"; break; + case 3: format = " fxt1"; break; + case 4: format = " dxt1_rb"; break; } + break; + case 7: + type = "4b indexed"; + switch ((dword>>3) & 0xf) { + case 7: format = " argb8888"; break; } + break; + } + dword = data[i]; + kgem_debug_print(data, offset, i++, "map %d MS3 [width=%d, height=%d, 
format=%s%s, tiling=%s%s]\n", + map, width, height, type, format, tiling, + dword&(1<<9)?" palette select":""); + + dword = data[i]; + pitch = 4*(((dword >> 21) & ((1 << 11) - 1))+1); + kgem_debug_print(data, offset, i++, "map %d MS4 [pitch=%d, max_lod=%i, vol_depth=%i, cube_face_ena=%x, %s]\n", + map, pitch, + (dword>>9)&0x3f, dword&0xff, (dword>>15)&0x3f, + dword&(1<<8)?"miplayout legacy":"miplayout right"); + } + } + assert (len == i); + return len; + case 0x85: + len = (data[0] & 0x0000000f) + 2; + assert (len == 2); + kgem_debug_print(data, offset, 0, + "3DSTATE_DEST_BUFFER_VARIABLES\n"); + + switch ((data[1] >> 8) & 0xf) { + case 0x0: format = "g8"; break; + case 0x1: format = "x1r5g5b5"; break; + case 0x2: format = "r5g6b5"; break; + case 0x3: format = "a8r8g8b8"; break; + case 0x4: format = "ycrcb_swapy"; break; + case 0x5: format = "ycrcb_normal"; break; + case 0x6: format = "ycrcb_swapuv"; break; + case 0x7: format = "ycrcb_swapuvy"; break; + case 0x8: format = "a4r4g4b4"; break; + case 0x9: format = "a1r5g5b5"; break; + case 0xa: format = "a2r10g10b10"; break; + default: format = "BAD"; break; + } + switch ((data[1] >> 2) & 0x3) { + case 0x0: zformat = "u16"; break; + case 0x1: zformat = "f16"; break; + case 0x2: zformat = "u24x8"; break; + default: zformat = "BAD"; break; + } + kgem_debug_print(data, offset, 1, "%s format, %s depth format, early Z %sabled\n", + format, zformat, + (data[1] & (1 << 31)) ? "en" : "dis"); + return len; + + case 0x8e: + { + const char *name, *tiling; + + len = (data[0] & 0x0000000f) + 2; + assert (len == 3); + + switch((data[1] >> 24) & 0x7) { + case 0x3: name = "color"; break; + case 0x7: name = "depth"; break; + default: name = "unknown"; break; + } + + tiling = "none"; + if (data[1] & (1 << 23)) + tiling = "fenced"; + else if (data[1] & (1 << 22)) + tiling = data[1] & (1 << 21) ? 
"Y" : "X"; + + kgem_debug_print(data, offset, 0, "3DSTATE_BUFFER_INFO\n"); + kgem_debug_print(data, offset, 1, "%s, tiling = %s, pitch=%d\n", name, tiling, data[1]&0xffff); + + kgem_debug_print(data, offset, 2, "address\n"); + return len; + } + + case 0x81: + len = (data[0] & 0x0000000f) + 2; + assert (len == 3); + + kgem_debug_print(data, offset, 0, + "3DSTATE_SCISSOR_RECTANGLE\n"); + kgem_debug_print(data, offset, 1, "(%d,%d)\n", + data[1] & 0xffff, data[1] >> 16); + kgem_debug_print(data, offset, 2, "(%d,%d)\n", + data[2] & 0xffff, data[2] >> 16); + return len; + + case 0x80: + len = (data[0] & 0x0000000f) + 2; + assert (len == 5); + + kgem_debug_print(data, offset, 0, + "3DSTATE_DRAWING_RECTANGLE\n"); + kgem_debug_print(data, offset, 1, "%s\n", + data[1]&(1<<30)?"depth ofs disabled ":""); + kgem_debug_print(data, offset, 2, "(%d,%d)\n", + data[2] & 0xffff, data[2] >> 16); + kgem_debug_print(data, offset, 3, "(%d,%d)\n", + data[3] & 0xffff, data[3] >> 16); + kgem_debug_print(data, offset, 4, "(%d,%d)\n", + data[4] & 0xffff, data[4] >> 16); + return len; + + case 0x9c: + len = (data[0] & 0x0000000f) + 2; + assert (len == 7); + + kgem_debug_print(data, offset, 0, + "3DSTATE_CLEAR_PARAMETERS\n"); + kgem_debug_print(data, offset, 1, "prim_type=%s, clear=%s%s%s\n", + data[1]&(1<<16)?"CLEAR_RECT":"ZONE_INIT", + data[1]&(1<<2)?"color,":"", + data[1]&(1<<1)?"depth,":"", + data[1]&(1<<0)?"stencil,":""); + kgem_debug_print(data, offset, 2, "clear color\n"); + kgem_debug_print(data, offset, 3, "clear depth/stencil\n"); + kgem_debug_print(data, offset, 4, "color value (rgba8888)\n"); + kgem_debug_print(data, offset, 5, "depth value %f\n", + int_as_float(data[5])); + kgem_debug_print(data, offset, 6, "clear stencil\n"); + return len; + } + + for (idx = 0; idx < ARRAY_SIZE(opcodes_3d_1d); idx++) { + opcode_3d_1d = &opcodes_3d_1d[idx]; + if (((data[0] & 0x00ff0000) >> 16) == opcode_3d_1d->opcode) { + len = 1; + + kgem_debug_print(data, offset, 0, "%s\n", opcode_3d_1d->name); + 
if (opcode_3d_1d->max_len > 1) { + len = (data[0] & 0x0000ffff) + 2; + assert (len >= opcode_3d_1d->min_len && + len <= opcode_3d_1d->max_len); + } + + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + + return len; + } + } + + kgem_debug_print(data, offset, 0, "3D UNKNOWN: 3d_1d opcode = 0x%x\n", opcode); + return 1; +} + +static int +decode_3d_1c(struct kgem *kgem, uint32_t offset) +{ + uint32_t *data = kgem->batch + offset; + uint32_t opcode; + + opcode = (data[0] & 0x00f80000) >> 19; + + switch (opcode) { + case 0x11: + kgem_debug_print(data, offset, 0, "3DSTATE_DEPTH_SUBRECTANGLE_DISABLE\n"); + return 1; + case 0x10: + kgem_debug_print(data, offset, 0, "3DSTATE_SCISSOR_ENABLE %s\n", + data[0]&1?"enabled":"disabled"); + return 1; + case 0x01: + kgem_debug_print(data, offset, 0, "3DSTATE_MAP_COORD_SET_I830\n"); + return 1; + case 0x0a: + kgem_debug_print(data, offset, 0, "3DSTATE_MAP_CUBE_I830\n"); + return 1; + case 0x05: + kgem_debug_print(data, offset, 0, "3DSTATE_MAP_TEX_STREAM_I830\n"); + return 1; + } + + kgem_debug_print(data, offset, 0, "3D UNKNOWN: 3d_1c opcode = 0x%x\n", + opcode); + return 1; +} + +int kgem_gen2_decode_3d(struct kgem *kgem, uint32_t offset) +{ + const static struct { + uint32_t opcode; + int min_len; + int max_len; + const char *name; + } opcodes[] = { + { 0x02, 1, 1, "3DSTATE_MODES_3" }, + { 0x03, 1, 1, "3DSTATE_ENABLES_1"}, + { 0x04, 1, 1, "3DSTATE_ENABLES_2"}, + { 0x05, 1, 1, "3DSTATE_VFT0"}, + { 0x06, 1, 1, "3DSTATE_AA"}, + { 0x07, 1, 1, "3DSTATE_RASTERIZATION_RULES" }, + { 0x08, 1, 1, "3DSTATE_MODES_1" }, + { 0x09, 1, 1, "3DSTATE_STENCIL_TEST" }, + { 0x0a, 1, 1, "3DSTATE_VFT1"}, + { 0x0b, 1, 1, "3DSTATE_INDPT_ALPHA_BLEND" }, + { 0x0c, 1, 1, "3DSTATE_MODES_5" }, + { 0x0d, 1, 1, "3DSTATE_MAP_BLEND_OP" }, + { 0x0e, 1, 1, "3DSTATE_MAP_BLEND_ARG" }, + { 0x0f, 1, 1, "3DSTATE_MODES_2" }, + { 0x15, 1, 1, "3DSTATE_FOG_COLOR" }, + { 0x16, 1, 1, "3DSTATE_MODES_4" }, + }; + uint32_t *data = kgem->batch + 
offset; + uint32_t opcode = (data[0] & 0x1f000000) >> 24; + uint32_t idx; + + switch (opcode) { + case 0x1f: + return decode_3d_primitive(kgem, offset); + case 0x1d: + return decode_3d_1d(kgem, offset); + case 0x1c: + return decode_3d_1c(kgem, offset); + } + + /* Catch the known instructions */ + for (idx = 0; idx < ARRAY_SIZE(opcodes); idx++) { + if (opcode == opcodes[idx].opcode) { + unsigned int len = 1, i; + + kgem_debug_print(data, offset, 0, "%s\n", opcodes[idx].name); + if (opcodes[idx].max_len > 1) { + len = (data[0] & 0xf) + 2; + assert(len >= opcodes[idx].min_len && + len <= opcodes[idx].max_len); + } + + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + return len; + } + } + + kgem_debug_print(data, offset, 0, "3D UNKNOWN: 3d opcode = 0x%x\n", opcode); + return 1; +} + +void kgem_gen2_finish_state(struct kgem *kgem) +{ + memset(&state, 0, sizeof(state)); +} diff --git a/cogl/driver/drm/kgem_debug_gen3.c b/cogl/driver/drm/kgem_debug_gen3.c new file mode 100644 index 00000000..1634225c --- /dev/null +++ b/cogl/driver/drm/kgem_debug_gen3.c @@ -0,0 +1,1600 @@ +/* + * Copyright © 2007-2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Eric Anholt + * Chris Wilson + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include "sna.h" +#include "sna_reg.h" + +#include "gen3_render.h" + +#include "kgem_debug.h" + +enum type { + T_FLOAT32, + T_FLOAT16, +}; + +static struct state { + struct vertex_buffer { + int handle; + void *base; + const char *ptr; + int pitch; + + struct kgem_bo *current; + } vb; + struct vertex_elements { + int offset; + bool valid; + enum type type; + int size; + uint8_t swizzle[4]; + } ve[33]; + int num_ve; +} state; + +static float int_as_float(int i) +{ + union { + float f; + int i; + } x; + x.i = i; + return x.f; +} + +static void gen3_update_vertex_buffer_addr(struct kgem *kgem, + uint32_t offset) +{ + uint32_t handle; + struct kgem_bo *bo = NULL; + void *base, *ptr; + int i; + + offset *= sizeof(uint32_t); + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == offset) + break; + assert(i < kgem->nreloc); + handle = kgem->reloc[i].target_handle; + + if (handle == 0) { + base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == handle) + break; + assert(&bo->request != &kgem->next_request->buffers); + base = kgem_bo_map__debug(kgem, bo); + } + ptr = (char *)base + kgem->reloc[i].delta; + + state.vb.current = bo; + state.vb.base = base; + state.vb.ptr = ptr; +} + +static void gen3_update_vertex_buffer_pitch(struct kgem *kgem, + uint32_t offset) +{ + state.vb.pitch = 
kgem->batch[offset] >> 16 & 0x3f; + state.vb.pitch *= sizeof(uint32_t); +} + +static void gen3_update_vertex_elements(struct kgem *kgem, uint32_t data) +{ + state.ve[1].valid = 1; + + switch ((data >> 6) & 7) { + case 1: + state.ve[1].type = T_FLOAT32; + state.ve[1].size = 3; + state.ve[1].swizzle[0] = 1; + state.ve[1].swizzle[1] = 1; + state.ve[1].swizzle[2] = 1; + state.ve[1].swizzle[3] = 3; + break; + case 2: + state.ve[1].type = T_FLOAT32; + state.ve[1].size = 4; + state.ve[1].swizzle[0] = 1; + state.ve[1].swizzle[1] = 1; + state.ve[1].swizzle[2] = 1; + state.ve[1].swizzle[3] = 1; + break; + case 3: + state.ve[1].type = T_FLOAT32; + state.ve[1].size = 2; + state.ve[1].swizzle[0] = 1; + state.ve[1].swizzle[1] = 1; + state.ve[1].swizzle[2] = 2; + state.ve[1].swizzle[3] = 3; + break; + case 4: + state.ve[1].type = T_FLOAT32; + state.ve[1].size = 3; + state.ve[1].swizzle[0] = 1; + state.ve[1].swizzle[1] = 1; + state.ve[1].swizzle[2] = 3; + state.ve[1].swizzle[3] = 1; + break; + } + + state.ve[2].valid = 0; + state.ve[3].valid = 0; +} + +static void gen3_update_vertex_texcoords(struct kgem *kgem, uint32_t data) +{ + int id; + for (id = 0; id < 8; id++) { + uint32_t fmt = (data >> (id*4)) & 0xf; + int width; + + state.ve[id+4].valid = fmt != 0xf; + + width = 0; + switch (fmt) { + case 0: + state.ve[id+4].type = T_FLOAT32; + width = state.ve[id+4].size = 2; + break; + case 1: + state.ve[id+4].type = T_FLOAT32; + width = state.ve[id+4].size = 3; + break; + case 2: + state.ve[id+4].type = T_FLOAT32; + width = state.ve[id+4].size = 4; + break; + case 3: + state.ve[id+4].type = T_FLOAT32; + width = state.ve[id+4].size = 1; + break; + case 4: + state.ve[id+4].type = T_FLOAT16; + width = state.ve[id+4].size = 2; + break; + case 5: + state.ve[id+4].type = T_FLOAT16; + width = state.ve[id+4].size = 4; + break; + } + + state.ve[id+4].swizzle[0] = width > 0 ? 1 : 2; + state.ve[id+4].swizzle[1] = width > 1 ? 1 : 2; + state.ve[id+4].swizzle[2] = width > 2 ? 
1 : 2; + state.ve[id+4].swizzle[3] = width > 3 ? 1 : 2; + } +} + +static void gen3_update_vertex_elements_offsets(struct kgem *kgem) +{ + int i, offset; + + for (i = offset = 0; i < ARRAY_SIZE(state.ve); i++) { + if (!state.ve[i].valid) + continue; + + state.ve[i].offset = offset; + offset += 4 * state.ve[i].size; + state.num_ve = i; + } +} + +static void vertices_float32_out(const struct vertex_elements *ve, const float *f, int max) +{ + int c; + + ErrorF("("); + for (c = 0; c < max; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("%f", f[c]); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < max-1) + ErrorF(", "); + } + ErrorF(")"); +} + +static void ve_out(const struct vertex_elements *ve, const void *ptr) +{ + switch (ve->type) { + case T_FLOAT32: + vertices_float32_out(ve, ptr, ve->size); + break; + case T_FLOAT16: + //vertices_float16_out(ve, ptr, ve->size); + break; + } +} + +static void indirect_vertex_out(struct kgem *kgem, uint32_t v) +{ + const struct vertex_buffer *vb = &state.vb; + int i = 1; + + do { + const struct vertex_elements *ve = &state.ve[i]; + const void *ptr = vb->ptr + v * vb->pitch + ve->offset; + + if (!ve->valid) + continue; + + ve_out(ve, ptr); + + while (++i <= state.num_ve && !state.ve[i].valid) + ; + + if (i <= state.num_ve) + ErrorF(", "); + } while (i <= state.num_ve); +} + +static int inline_vertex_out(struct kgem *kgem, void *base) +{ + const struct vertex_buffer *vb = &state.vb; + int i = 1; + + do { + const struct vertex_elements *ve = &state.ve[i]; + const void *ptr = (char *)base + ve->offset; + + if (!ve->valid) + continue; + + ve_out(ve, ptr); + + while (++i <= state.num_ve && !state.ve[i].valid) + ; + + if (i <= state.num_ve) + ErrorF(", "); + } while (i <= state.num_ve); + + return vb->pitch; +} + +static int +gen3_decode_3d_1c(struct kgem *kgem, uint32_t offset) +{ + uint32_t *data 
= kgem->batch + offset; + uint32_t opcode; + + opcode = (data[0] & 0x00f80000) >> 19; + + switch (opcode) { + case 0x11: + kgem_debug_print(data, offset, 0, "3DSTATE_DEPTH_SUBRECTANGLE_DISABLE\n"); + return 1; + case 0x10: + kgem_debug_print(data, offset, 0, "3DSTATE_SCISSOR_ENABLE %s\n", + data[0]&1?"enabled":"disabled"); + return 1; + case 0x01: + kgem_debug_print(data, offset, 0, "3DSTATE_MAP_COORD_SET_I830\n"); + return 1; + case 0x0a: + kgem_debug_print(data, offset, 0, "3DSTATE_MAP_CUBE_I830\n"); + return 1; + case 0x05: + kgem_debug_print(data, offset, 0, "3DSTATE_MAP_TEX_STREAM_I830\n"); + return 1; + } + + kgem_debug_print(data, offset, 0, "3D UNKNOWN: 3d_1c opcode = 0x%x\n", + opcode); + assert(0); + return 1; +} + +/** Sets the string dstname to describe the destination of the PS instruction */ +static void +gen3_get_instruction_dst(uint32_t *data, int i, char *dstname, int do_mask) +{ + uint32_t a0 = data[i]; + int dst_nr = (a0 >> 14) & 0xf; + char dstmask[8]; + const char *sat; + + if (do_mask) { + if (((a0 >> 10) & 0xf) == 0xf) { + dstmask[0] = 0; + } else { + int dstmask_index = 0; + + dstmask[dstmask_index++] = '.'; + if (a0 & (1 << 10)) + dstmask[dstmask_index++] = 'x'; + if (a0 & (1 << 11)) + dstmask[dstmask_index++] = 'y'; + if (a0 & (1 << 12)) + dstmask[dstmask_index++] = 'z'; + if (a0 & (1 << 13)) + dstmask[dstmask_index++] = 'w'; + dstmask[dstmask_index++] = 0; + } + + if (a0 & (1 << 22)) + sat = ".sat"; + else + sat = ""; + } else { + dstmask[0] = 0; + sat = ""; + } + + switch ((a0 >> 19) & 0x7) { + case 0: + assert(dst_nr <= 15); + sprintf(dstname, "R%d%s%s", dst_nr, dstmask, sat); + break; + case 4: + assert(dst_nr == 0); + sprintf(dstname, "oC%s%s", dstmask, sat); + break; + case 5: + assert(dst_nr == 0); + sprintf(dstname, "oD%s%s", dstmask, sat); + break; + case 6: + assert(dst_nr <= 3); + sprintf(dstname, "U%d%s%s", dst_nr, dstmask, sat); + break; + default: + sprintf(dstname, "RESERVED"); + break; + } +} + +static const char * 
+gen3_get_channel_swizzle(uint32_t select) +{ + switch (select & 0x7) { + case 0: + return (select & 8) ? "-x" : "x"; + case 1: + return (select & 8) ? "-y" : "y"; + case 2: + return (select & 8) ? "-z" : "z"; + case 3: + return (select & 8) ? "-w" : "w"; + case 4: + return (select & 8) ? "-0" : "0"; + case 5: + return (select & 8) ? "-1" : "1"; + default: + return (select & 8) ? "-bad" : "bad"; + } +} + +static void +gen3_get_instruction_src_name(uint32_t src_type, uint32_t src_nr, char *name) +{ + switch (src_type) { + case 0: + sprintf(name, "R%d", src_nr); + assert(src_nr <= 15); + break; + case 1: + if (src_nr < 8) + sprintf(name, "T%d", src_nr); + else if (src_nr == 8) + sprintf(name, "DIFFUSE"); + else if (src_nr == 9) + sprintf(name, "SPECULAR"); + else if (src_nr == 10) + sprintf(name, "FOG"); + else { + assert(0); + sprintf(name, "RESERVED"); + } + break; + case 2: + sprintf(name, "C%d", src_nr); + assert(src_nr <= 31); + break; + case 4: + sprintf(name, "oC"); + assert(src_nr == 0); + break; + case 5: + sprintf(name, "oD"); + assert(src_nr == 0); + break; + case 6: + sprintf(name, "U%d", src_nr); + assert(src_nr <= 3); + break; + default: + sprintf(name, "RESERVED"); + assert(0); + break; + } +} + +static void +gen3_get_instruction_src0(uint32_t *data, int i, char *srcname) +{ + uint32_t a0 = data[i]; + uint32_t a1 = data[i + 1]; + int src_nr = (a0 >> 2) & 0x1f; + const char *swizzle_x = gen3_get_channel_swizzle((a1 >> 28) & 0xf); + const char *swizzle_y = gen3_get_channel_swizzle((a1 >> 24) & 0xf); + const char *swizzle_z = gen3_get_channel_swizzle((a1 >> 20) & 0xf); + const char *swizzle_w = gen3_get_channel_swizzle((a1 >> 16) & 0xf); + char swizzle[100]; + + gen3_get_instruction_src_name((a0 >> 7) & 0x7, src_nr, srcname); + sprintf(swizzle, ".%s%s%s%s", swizzle_x, swizzle_y, swizzle_z, swizzle_w); + if (strcmp(swizzle, ".xyzw") != 0) + strcat(srcname, swizzle); +} + +static void +gen3_get_instruction_src1(uint32_t *data, int i, char *srcname) +{ + 
uint32_t a1 = data[i + 1]; + uint32_t a2 = data[i + 2]; + int src_nr = (a1 >> 8) & 0x1f; + const char *swizzle_x = gen3_get_channel_swizzle((a1 >> 4) & 0xf); + const char *swizzle_y = gen3_get_channel_swizzle((a1 >> 0) & 0xf); + const char *swizzle_z = gen3_get_channel_swizzle((a2 >> 28) & 0xf); + const char *swizzle_w = gen3_get_channel_swizzle((a2 >> 24) & 0xf); + char swizzle[100]; + + gen3_get_instruction_src_name((a1 >> 13) & 0x7, src_nr, srcname); + sprintf(swizzle, ".%s%s%s%s", swizzle_x, swizzle_y, swizzle_z, swizzle_w); + if (strcmp(swizzle, ".xyzw") != 0) + strcat(srcname, swizzle); +} + +static void +gen3_get_instruction_src2(uint32_t *data, int i, char *srcname) +{ + uint32_t a2 = data[i + 2]; + int src_nr = (a2 >> 16) & 0x1f; + const char *swizzle_x = gen3_get_channel_swizzle((a2 >> 12) & 0xf); + const char *swizzle_y = gen3_get_channel_swizzle((a2 >> 8) & 0xf); + const char *swizzle_z = gen3_get_channel_swizzle((a2 >> 4) & 0xf); + const char *swizzle_w = gen3_get_channel_swizzle((a2 >> 0) & 0xf); + char swizzle[100]; + + gen3_get_instruction_src_name((a2 >> 21) & 0x7, src_nr, srcname); + sprintf(swizzle, ".%s%s%s%s", swizzle_x, swizzle_y, swizzle_z, swizzle_w); + if (strcmp(swizzle, ".xyzw") != 0) + strcat(srcname, swizzle); +} + +static void +gen3_get_instruction_addr(uint32_t src_type, uint32_t src_nr, char *name) +{ + switch (src_type) { + case 0: + sprintf(name, "R%d", src_nr); + assert(src_nr <= 15); + break; + case 1: + if (src_nr < 8) + sprintf(name, "T%d", src_nr); + else if (src_nr == 8) + sprintf(name, "DIFFUSE"); + else if (src_nr == 9) + sprintf(name, "SPECULAR"); + else if (src_nr == 10) + sprintf(name, "FOG"); + else { + assert(0); + sprintf(name, "RESERVED"); + } + break; + case 4: + sprintf(name, "oC"); + assert(src_nr == 0); + break; + case 5: + sprintf(name, "oD"); + assert(src_nr == 0); + break; + default: + assert(0); + sprintf(name, "RESERVED"); + break; + } +} + +static void +gen3_decode_alu1(uint32_t *data, uint32_t offset, + 
int i, char *instr_prefix, const char *op_name) +{ + char dst[100], src0[100]; + + gen3_get_instruction_dst(data, i, dst, 1); + gen3_get_instruction_src0(data, i, src0); + + kgem_debug_print(data, offset, i++, "%s: %s %s, %s\n", instr_prefix, + op_name, dst, src0); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); +} + +static void +gen3_decode_alu2(uint32_t *data, uint32_t offset, + int i, char *instr_prefix, const char *op_name) +{ + char dst[100], src0[100], src1[100]; + + gen3_get_instruction_dst(data, i, dst, 1); + gen3_get_instruction_src0(data, i, src0); + gen3_get_instruction_src1(data, i, src1); + + kgem_debug_print(data, offset, i++, "%s: %s %s, %s, %s\n", instr_prefix, + op_name, dst, src0, src1); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); +} + +static void +gen3_decode_alu3(uint32_t *data, uint32_t offset, + int i, char *instr_prefix, const char *op_name) +{ + char dst[100], src0[100], src1[100], src2[100]; + + gen3_get_instruction_dst(data, i, dst, 1); + gen3_get_instruction_src0(data, i, src0); + gen3_get_instruction_src1(data, i, src1); + gen3_get_instruction_src2(data, i, src2); + + kgem_debug_print(data, offset, i++, "%s: %s %s, %s, %s, %s\n", instr_prefix, + op_name, dst, src0, src1, src2); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); +} + +static void +gen3_decode_tex(uint32_t *data, uint32_t offset, int i, char *instr_prefix, + const char *tex_name) +{ + uint32_t t0 = data[i]; + uint32_t t1 = data[i + 1]; + char dst_name[100]; + char addr_name[100]; + int sampler_nr; + + gen3_get_instruction_dst(data, i, dst_name, 0); + gen3_get_instruction_addr((t1 >> 24) & 0x7, + (t1 >> 17) & 0xf, + addr_name); + sampler_nr = t0 & 0xf; + + kgem_debug_print(data, offset, i++, "%s: %s %s, S%d, %s\n", instr_prefix, + tex_name, dst_name, 
sampler_nr, addr_name); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); +} + +static void +gen3_decode_dcl(uint32_t *data, uint32_t offset, int i, char *instr_prefix) +{ + uint32_t d0 = data[i]; + const char *sampletype; + int dcl_nr = (d0 >> 14) & 0xf; + const char *dcl_x = d0 & (1 << 10) ? "x" : ""; + const char *dcl_y = d0 & (1 << 11) ? "y" : ""; + const char *dcl_z = d0 & (1 << 12) ? "z" : ""; + const char *dcl_w = d0 & (1 << 13) ? "w" : ""; + char dcl_mask[10]; + + switch ((d0 >> 19) & 0x3) { + case 1: + sprintf(dcl_mask, ".%s%s%s%s", dcl_x, dcl_y, dcl_z, dcl_w); + assert (strcmp(dcl_mask, ".")); + + assert(dcl_nr <= 10); + if (dcl_nr < 8) { + if (strcmp(dcl_mask, ".x") != 0 && + strcmp(dcl_mask, ".xy") != 0 && + strcmp(dcl_mask, ".xz") != 0 && + strcmp(dcl_mask, ".w") != 0 && + strcmp(dcl_mask, ".xyzw") != 0) { + assert(0); + } + kgem_debug_print(data, offset, i++, "%s: DCL T%d%s\n", instr_prefix, + dcl_nr, dcl_mask); + } else { + if (strcmp(dcl_mask, ".xz") == 0) + assert(0); + else if (strcmp(dcl_mask, ".xw") == 0) + assert(0); + else if (strcmp(dcl_mask, ".xzw") == 0) + assert(0); + + if (dcl_nr == 8) { + kgem_debug_print(data, offset, i++, "%s: DCL DIFFUSE%s\n", instr_prefix, + dcl_mask); + } else if (dcl_nr == 9) { + kgem_debug_print(data, offset, i++, "%s: DCL SPECULAR%s\n", instr_prefix, + dcl_mask); + } else if (dcl_nr == 10) { + kgem_debug_print(data, offset, i++, "%s: DCL FOG%s\n", instr_prefix, + dcl_mask); + } + } + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + break; + case 3: + switch ((d0 >> 22) & 0x3) { + case 0: + sampletype = "2D"; + break; + case 1: + sampletype = "CUBE"; + break; + case 2: + sampletype = "3D"; + break; + default: + sampletype = "RESERVED"; + break; + } + assert(dcl_nr <= 15); + kgem_debug_print(data, offset, i++, "%s: DCL S%d %s\n", instr_prefix, + dcl_nr, sampletype); + 
kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + break; + default: + kgem_debug_print(data, offset, i++, "%s: DCL RESERVED%d\n", instr_prefix, dcl_nr); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + } +} + +static void +gen3_decode_instruction(uint32_t *data, uint32_t offset, + int i, char *instr_prefix) +{ + switch ((data[i] >> 24) & 0x1f) { + case 0x0: + kgem_debug_print(data, offset, i++, "%s: NOP\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + break; + case 0x01: + gen3_decode_alu2(data, offset, i, instr_prefix, "ADD"); + break; + case 0x02: + gen3_decode_alu1(data, offset, i, instr_prefix, "MOV"); + break; + case 0x03: + gen3_decode_alu2(data, offset, i, instr_prefix, "MUL"); + break; + case 0x04: + gen3_decode_alu3(data, offset, i, instr_prefix, "MAD"); + break; + case 0x05: + gen3_decode_alu3(data, offset, i, instr_prefix, "DP2ADD"); + break; + case 0x06: + gen3_decode_alu2(data, offset, i, instr_prefix, "DP3"); + break; + case 0x07: + gen3_decode_alu2(data, offset, i, instr_prefix, "DP4"); + break; + case 0x08: + gen3_decode_alu1(data, offset, i, instr_prefix, "FRC"); + break; + case 0x09: + gen3_decode_alu1(data, offset, i, instr_prefix, "RCP"); + break; + case 0x0a: + gen3_decode_alu1(data, offset, i, instr_prefix, "RSQ"); + break; + case 0x0b: + gen3_decode_alu1(data, offset, i, instr_prefix, "EXP"); + break; + case 0x0c: + gen3_decode_alu1(data, offset, i, instr_prefix, "LOG"); + break; + case 0x0d: + gen3_decode_alu2(data, offset, i, instr_prefix, "CMP"); + break; + case 0x0e: + gen3_decode_alu2(data, offset, i, instr_prefix, "MIN"); + break; + case 0x0f: + gen3_decode_alu2(data, offset, i, instr_prefix, "MAX"); + break; + case 0x10: + gen3_decode_alu1(data, offset, i, instr_prefix, "FLR"); + break; + case 0x11: 
+ gen3_decode_alu1(data, offset, i, instr_prefix, "MOD"); + break; + case 0x12: + gen3_decode_alu1(data, offset, i, instr_prefix, "TRC"); + break; + case 0x13: + gen3_decode_alu2(data, offset, i, instr_prefix, "SGE"); + break; + case 0x14: + gen3_decode_alu2(data, offset, i, instr_prefix, "SLT"); + break; + case 0x15: + gen3_decode_tex(data, offset, i, instr_prefix, "TEXLD"); + break; + case 0x16: + gen3_decode_tex(data, offset, i, instr_prefix, "TEXLDP"); + break; + case 0x17: + gen3_decode_tex(data, offset, i, instr_prefix, "TEXLDB"); + break; + case 0x19: + gen3_decode_dcl(data, offset, i, instr_prefix); + break; + default: + kgem_debug_print(data, offset, i++, "%s: unknown\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + kgem_debug_print(data, offset, i++, "%s\n", instr_prefix); + break; + } +} + +static const char * +gen3_decode_compare_func(uint32_t op) +{ + switch (op&0x7) { + case 0: return "always"; + case 1: return "never"; + case 2: return "less"; + case 3: return "equal"; + case 4: return "lequal"; + case 5: return "greater"; + case 6: return "notequal"; + case 7: return "gequal"; + } + return ""; +} + +static const char * +gen3_decode_stencil_op(uint32_t op) +{ + switch (op&0x7) { + case 0: return "keep"; + case 1: return "zero"; + case 2: return "replace"; + case 3: return "incr_sat"; + case 4: return "decr_sat"; + case 5: return "greater"; + case 6: return "incr"; + case 7: return "decr"; + } + return ""; +} + +#if 0 +/* part of MODES_4 */ +static const char * +gen3_decode_logic_op(uint32_t op) +{ + switch (op&0xf) { + case 0: return "clear"; + case 1: return "nor"; + case 2: return "and_inv"; + case 3: return "copy_inv"; + case 4: return "and_rvrse"; + case 5: return "inv"; + case 6: return "xor"; + case 7: return "nand"; + case 8: return "and"; + case 9: return "equiv"; + case 10: return "noop"; + case 11: return "or_inv"; + case 12: return "copy"; + case 13: return "or_rvrse"; + case 14: return "or"; + case 15: 
return "set"; + } + return ""; +} +#endif + +static const char * +gen3_decode_blend_fact(uint32_t op) +{ + switch (op&0xf) { + case 1: return "zero"; + case 2: return "one"; + case 3: return "src_colr"; + case 4: return "inv_src_colr"; + case 5: return "src_alpha"; + case 6: return "inv_src_alpha"; + case 7: return "dst_alpha"; + case 8: return "inv_dst_alpha"; + case 9: return "dst_colr"; + case 10: return "inv_dst_colr"; + case 11: return "src_alpha_sat"; + case 12: return "cnst_colr"; + case 13: return "inv_cnst_colr"; + case 14: return "cnst_alpha"; + case 15: return "inv_const_alpha"; + } + return ""; +} + +static const char * +decode_tex_coord_mode(uint32_t mode) +{ + switch (mode&0x7) { + case 0: return "wrap"; + case 1: return "mirror"; + case 2: return "clamp_edge"; + case 3: return "cube"; + case 4: return "clamp_border"; + case 5: return "mirror_once"; + } + return ""; +} + +static const char * +gen3_decode_sample_filter(uint32_t mode) +{ + switch (mode&0x7) { + case 0: return "nearest"; + case 1: return "linear"; + case 2: return "anisotropic"; + case 3: return "4x4_1"; + case 4: return "4x4_2"; + case 5: return "4x4_flat"; + case 6: return "6x5_mono"; + } + return ""; +} + +static int +gen3_decode_load_state_immediate_1(struct kgem *kgem, uint32_t offset) +{ + const uint32_t *data = kgem->batch + offset; + int len, i, word; + + kgem_debug_print(data, offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_1\n"); + len = (data[0] & 0x0000000f) + 2; + i = 1; + for (word = 0; word <= 8; word++) { + if (data[0] & (1 << (4 + word))) { + switch (word) { + case 0: + kgem_debug_print(data, offset, i, "S0: vbo offset: 0x%08x%s\n", + data[i]&(~1),data[i]&1?", auto cache invalidate disabled":""); + gen3_update_vertex_buffer_addr(kgem, offset + i); + break; + case 1: + kgem_debug_print(data, offset, i, "S1: vertex width: %i, vertex pitch: %i\n", + (data[i]>>24)&0x3f,(data[i]>>16)&0x3f); + gen3_update_vertex_buffer_pitch(kgem, offset + i); + break; + case 2: + { + char buf[200]; 
+ int len = 0; + int tex_num; + for (tex_num = 0; tex_num < 8; tex_num++) { + switch((data[i]>>tex_num*4)&0xf) { + case 0: len += sprintf(buf + len, "%i=2D ", tex_num); break; + case 1: len += sprintf(buf + len, "%i=3D ", tex_num); break; + case 2: len += sprintf(buf + len, "%i=4D ", tex_num); break; + case 3: len += sprintf(buf + len, "%i=1D ", tex_num); break; + case 4: len += sprintf(buf + len, "%i=2D_16 ", tex_num); break; + case 5: len += sprintf(buf + len, "%i=4D_16 ", tex_num); break; + case 0xf: len += sprintf(buf + len, "%i=NP ", tex_num); break; + } + } + kgem_debug_print(data, offset, i, "S2: texcoord formats: %s\n", buf); + gen3_update_vertex_texcoords(kgem, data[i]); + } + + break; + case 3: + kgem_debug_print(data, offset, i, "S3: not documented\n"); + break; + case 4: + { + const char *cullmode = ""; + const char *vfmt_xyzw = ""; + switch((data[i]>>13)&0x3) { + case 0: cullmode = "both"; break; + case 1: cullmode = "none"; break; + case 2: cullmode = "cw"; break; + case 3: cullmode = "ccw"; break; + } + switch(data[i] & (7<<6 | 1<<2)) { + case 1<<6: vfmt_xyzw = "XYZ,"; break; + case 2<<6: vfmt_xyzw = "XYZW,"; break; + case 3<<6: vfmt_xyzw = "XY,"; break; + case 4<<6: vfmt_xyzw = "XYW,"; break; + case 1<<6 | 1<<2: vfmt_xyzw = "XYZF,"; break; + case 2<<6 | 1<<2: vfmt_xyzw = "XYZWF,"; break; + case 3<<6 | 1<<2: vfmt_xyzw = "XYF,"; break; + case 4<<6 | 1<<2: vfmt_xyzw = "XYWF,"; break; + } + kgem_debug_print(data, offset, i, "S4: point_width=%i, line_width=%.1f," + "%s%s%s%s%s cullmode=%s, vfmt=%s%s%s%s%s%s%s%s " + "%s%s%s\n", + (data[i]>>23)&0x1ff, + ((data[i]>>19)&0xf) / 2.0, + data[i]&(0xf<<15)?" 
flatshade=":"", + data[i]&(1<<18)?"Alpha,":"", + data[i]&(1<<17)?"Fog,":"", + data[i]&(1<<16)?"Specular,":"", + data[i]&(1<<15)?"Color,":"", + cullmode, + data[i]&(1<<12)?"PointWidth,":"", + data[i]&(1<<11)?"SpecFog,":"", + data[i]&(1<<10)?"Color,":"", + data[i]&(1<<9)?"DepthOfs,":"", + vfmt_xyzw, + data[i]&(1<<9)?"FogParam,":"", + data[i]&(1<<5)?"force default diffuse, ":"", + data[i]&(1<<4)?"force default specular, ":"", + data[i]&(1<<3)?"local depth ofs enable, ":"", + data[i]&(1<<1)?"point sprite enable, ":"", + data[i]&(1<<0)?"line AA enable, ":""); + gen3_update_vertex_elements(kgem, data[i]); + break; + } + case 5: + { + kgem_debug_print(data, offset, i, "S5:%s%s%s%s%s" + "%s%s%s%s stencil_ref=0x%x, stencil_test=%s, " + "stencil_fail=%s, stencil_pass_z_fail=%s, " + "stencil_pass_z_pass=%s, %s%s%s%s\n", + data[i]&(0xf<<28)?" write_disable=":"", + data[i]&(1<<31)?"Alpha,":"", + data[i]&(1<<30)?"Red,":"", + data[i]&(1<<29)?"Green,":"", + data[i]&(1<<28)?"Blue,":"", + data[i]&(1<<27)?" force default point size,":"", + data[i]&(1<<26)?" last pixel enable,":"", + data[i]&(1<<25)?" global depth ofs enable,":"", + data[i]&(1<<24)?" 
fog enable,":"", + (data[i]>>16)&0xff, + gen3_decode_compare_func(data[i]>>13), + gen3_decode_stencil_op(data[i]>>10), + gen3_decode_stencil_op(data[i]>>7), + gen3_decode_stencil_op(data[i]>>4), + data[i]&(1<<3)?"stencil write enable, ":"", + data[i]&(1<<2)?"stencil test enable, ":"", + data[i]&(1<<1)?"color dither enable, ":"", + data[i]&(1<<0)?"logicop enable, ":""); + } + break; + case 6: + kgem_debug_print(data, offset, i, "S6: %salpha_test=%s, alpha_ref=0x%x, " + "depth_test=%s, %ssrc_blnd_fct=%s, dst_blnd_fct=%s, " + "%s%stristrip_provoking_vertex=%i\n", + data[i]&(1<<31)?"alpha test enable, ":"", + gen3_decode_compare_func(data[i]>>28), + data[i]&(0xff<<20), + gen3_decode_compare_func(data[i]>>16), + data[i]&(1<<15)?"cbuf blend enable, ":"", + gen3_decode_blend_fact(data[i]>>8), + gen3_decode_blend_fact(data[i]>>4), + data[i]&(1<<3)?"depth write enable, ":"", + data[i]&(1<<2)?"cbuf write enable, ":"", + data[i]&(0x3)); + break; + case 7: + kgem_debug_print(data, offset, i, "S7: depth offset constant: 0x%08x\n", data[i]); + break; + } + i++; + } + } + + assert(len == i); + return len; +} + +static int +gen3_decode_3d_1d(struct kgem *kgem, uint32_t offset) +{ + uint32_t *data = kgem->batch + offset; + unsigned int len, i, c, idx, word, map, sampler, instr; + const char *format, *zformat, *type; + uint32_t opcode; + static const struct { + uint32_t opcode; + int min_len; + int max_len; + const char *name; + } opcodes_3d_1d[] = { + { 0x86, 4, 4, "3DSTATE_CHROMA_KEY" }, + { 0x88, 2, 2, "3DSTATE_CONSTANT_BLEND_COLOR" }, + { 0x99, 2, 2, "3DSTATE_DEFAULT_DIFFUSE" }, + { 0x9a, 2, 2, "3DSTATE_DEFAULT_SPECULAR" }, + { 0x98, 2, 2, "3DSTATE_DEFAULT_Z" }, + { 0x97, 2, 2, "3DSTATE_DEPTH_OFFSET_SCALE" }, + { 0x9d, 65, 65, "3DSTATE_FILTER_COEFFICIENTS_4X4" }, + { 0x9e, 4, 4, "3DSTATE_MONO_FILTER" }, + { 0x89, 4, 4, "3DSTATE_FOG_MODE" }, + { 0x8f, 2, 16, "3DSTATE_MAP_PALLETE_LOAD_32" }, + { 0x83, 2, 2, "3DSTATE_SPAN_STIPPLE" }, + }, *opcode_3d_1d; + + opcode = (data[0] & 
0x00ff0000) >> 16; + + switch (opcode) { + case 0x07: + /* This instruction is unusual. A 0 length means just 1 DWORD instead of + * 2. The 0 length is specified in one place to be unsupported, but + * stated to be required in another, and 0 length LOAD_INDIRECTs appear + * to cause no harm at least. + */ + kgem_debug_print(data, offset, 0, "3DSTATE_LOAD_INDIRECT\n"); + len = (data[0] & 0x000000ff) + 1; + i = 1; + if (data[0] & (0x01 << 8)) { + kgem_debug_print(data, offset, i++, "SIS.0\n"); + kgem_debug_print(data, offset, i++, "SIS.1\n"); + } + if (data[0] & (0x02 << 8)) { + kgem_debug_print(data, offset, i++, "DIS.0\n"); + } + if (data[0] & (0x04 << 8)) { + kgem_debug_print(data, offset, i++, "SSB.0\n"); + kgem_debug_print(data, offset, i++, "SSB.1\n"); + } + if (data[0] & (0x08 << 8)) { + kgem_debug_print(data, offset, i++, "MSB.0\n"); + kgem_debug_print(data, offset, i++, "MSB.1\n"); + } + if (data[0] & (0x10 << 8)) { + kgem_debug_print(data, offset, i++, "PSP.0\n"); + kgem_debug_print(data, offset, i++, "PSP.1\n"); + } + if (data[0] & (0x20 << 8)) { + kgem_debug_print(data, offset, i++, "PSC.0\n"); + kgem_debug_print(data, offset, i++, "PSC.1\n"); + } + assert(len == i); + return len; + case 0x04: + return gen3_decode_load_state_immediate_1(kgem, offset); + case 0x03: + kgem_debug_print(data, offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_2\n"); + len = (data[0] & 0x0000000f) + 2; + i = 1; + for (word = 6; word <= 14; word++) { + if (data[0] & (1 << word)) { + if (word == 6) + kgem_debug_print(data, offset, i++, "TBCF\n"); + else if (word >= 7 && word <= 10) { + kgem_debug_print(data, offset, i++, "TB%dC\n", word - 7); + kgem_debug_print(data, offset, i++, "TB%dA\n", word - 7); + } else if (word >= 11 && word <= 14) { + kgem_debug_print(data, offset, i, "TM%dS0: offset=0x%08x, %s\n", + word - 11, + data[i]&0xfffffffe, + data[i]&1?"use fence":""); + i++; + kgem_debug_print(data, offset, i, "TM%dS1: height=%i, width=%i, %s\n", + word - 11, + data[i]>>21, 
(data[i]>>10)&0x3ff, + data[i]&2?(data[i]&1?"y-tiled":"x-tiled"):""); + i++; + kgem_debug_print(data, offset, i, "TM%dS2: pitch=%i, \n", + word - 11, + ((data[i]>>21) + 1)*4); + i++; + kgem_debug_print(data, offset, i++, "TM%dS3\n", word - 11); + kgem_debug_print(data, offset, i++, "TM%dS4: dflt color\n", word - 11); + } + } + } + assert(len == i); + return len; + case 0x00: + kgem_debug_print(data, offset, 0, "3DSTATE_MAP_STATE\n"); + len = (data[0] & 0x0000003f) + 2; + kgem_debug_print(data, offset, 1, "mask\n"); + + i = 2; + for (map = 0; map <= 15; map++) { + if (data[1] & (1 << map)) { + int width, height, pitch, dword; + struct drm_i915_gem_relocation_entry *reloc; + const char *tiling; + + reloc = kgem_debug_get_reloc_entry(kgem, &data[i] - kgem->batch); + assert(reloc->target_handle); + + dword = data[i]; + kgem_debug_print(data, offset, i++, "map %d MS2 %s%s%s, handle=%d\n", map, + dword&(1<<31)?"untrusted surface, ":"", + dword&(1<<1)?"vertical line stride enable, ":"", + dword&(1<<0)?"vertical ofs enable, ":"", + reloc->target_handle); + + dword = data[i]; + width = ((dword >> 10) & ((1 << 11) - 1))+1; + height = ((dword >> 21) & ((1 << 11) - 1))+1; + + tiling = "none"; + if (dword & (1 << 2)) + tiling = "fenced"; + else if (dword & (1 << 1)) + tiling = dword & (1 << 0) ? 
"Y" : "X"; + type = " BAD"; + format = " (invalid)"; + switch ((dword>>7) & 0x7) { + case 1: + type = "8"; + switch ((dword>>3) & 0xf) { + case 0: format = "I"; break; + case 1: format = "L"; break; + case 4: format = "A"; break; + case 5: format = " mono"; break; + } + break; + case 2: + type = "16"; + switch ((dword>>3) & 0xf) { + case 0: format = " rgb565"; break; + case 1: format = " argb1555"; break; + case 2: format = " argb4444"; break; + case 3: format = " ay88"; break; + case 5: format = " 88dvdu"; break; + case 6: format = " bump655"; break; + case 7: format = "I"; break; + case 8: format = "L"; break; + case 9: format = "A"; break; + } + break; + case 3: + type = "32"; + switch ((dword>>3) & 0xf) { + case 0: format = " argb8888"; break; + case 1: format = " abgr8888"; break; + case 2: format = " xrgb8888"; break; + case 3: format = " xbgr8888"; break; + case 4: format = " qwvu8888"; break; + case 5: format = " axvu8888"; break; + case 6: format = " lxvu8888"; break; + case 7: format = " xlvu8888"; break; + case 8: format = " argb2101010"; break; + case 9: format = " abgr2101010"; break; + case 10: format = " awvu2101010"; break; + case 11: format = " gr1616"; break; + case 12: format = " vu1616"; break; + case 13: format = " xI824"; break; + case 14: format = " xA824"; break; + case 15: format = " xL824"; break; + } + break; + case 5: + type = "422"; + switch ((dword>>3) & 0xf) { + case 0: format = " yuv_swapy"; break; + case 1: format = " yuv"; break; + case 2: format = " yuv_swapuv"; break; + case 3: format = " yuv_swapuvy"; break; + } + break; + case 6: + type = "compressed"; + switch ((dword>>3) & 0x7) { + case 0: format = " dxt1"; break; + case 1: format = " dxt2_3"; break; + case 2: format = " dxt4_5"; break; + case 3: format = " fxt1"; break; + case 4: format = " dxt1_rb"; break; + } + break; + case 7: + type = "4b indexed"; + switch ((dword>>3) & 0xf) { + case 7: format = " argb8888"; break; + } + break; + default: + format = "BAD"; + break; + } 
+ dword = data[i]; + kgem_debug_print(data, offset, i++, "map %d MS3 [width=%d, height=%d, format=%s%s, tiling=%s%s]\n", + map, width, height, type, format, tiling, + dword&(1<<9)?" palette select":""); + + dword = data[i]; + pitch = 4*(((dword >> 21) & ((1 << 11) - 1))+1); + kgem_debug_print(data, offset, i++, "map %d MS4 [pitch=%d, max_lod=%i, vol_depth=%i, cube_face_ena=%x, %s]\n", + map, pitch, + (dword>>9)&0x3f, dword&0xff, (dword>>15)&0x3f, + dword&(1<<8)?"miplayout legacy":"miplayout right"); + } + } + assert(len == i); + return len; + case 0x06: + kgem_debug_print(data, offset, 0, "3DSTATE_PIXEL_SHADER_CONSTANTS\n"); + len = (data[0] & 0x000000ff) + 2; + + i = 2; + for (c = 0; c <= 31; c++) { + if (data[1] & (1 << c)) { + kgem_debug_print(data, offset, i, "C%d.X = %f\n", + c, int_as_float(data[i])); + i++; + kgem_debug_print(data, offset, i, "C%d.Y = %f\n", + c, int_as_float(data[i])); + i++; + kgem_debug_print(data, offset, i, "C%d.Z = %f\n", + c, int_as_float(data[i])); + i++; + kgem_debug_print(data, offset, i, "C%d.W = %f\n", + c, int_as_float(data[i])); + i++; + } + } + assert(len == i); + return len; + case 0x05: + kgem_debug_print(data, offset, 0, "3DSTATE_PIXEL_SHADER_PROGRAM\n"); + len = (data[0] & 0x000000ff) + 2; + assert(((len-1) % 3) == 0); + assert(len <= 370); + i = 1; + for (instr = 0; instr < (len - 1) / 3; instr++) { + char instr_prefix[10]; + + sprintf(instr_prefix, "PS%03d", instr); + gen3_decode_instruction(data, offset, i, instr_prefix); + i += 3; + } + return len; + case 0x01: + kgem_debug_print(data, offset, 0, "3DSTATE_SAMPLER_STATE\n"); + kgem_debug_print(data, offset, 1, "mask\n"); + len = (data[0] & 0x0000003f) + 2; + i = 2; + for (sampler = 0; sampler <= 15; sampler++) { + if (data[1] & (1 << sampler)) { + uint32_t dword; + const char *mip_filter = ""; + dword = data[i]; + switch ((dword>>20)&0x3) { + case 0: mip_filter = "none"; break; + case 1: mip_filter = "nearest"; break; + case 3: mip_filter = "linear"; break; + } + 
kgem_debug_print(data, offset, i++, "sampler %d SS2:%s%s%s " + "base_mip_level=%i, mip_filter=%s, mag_filter=%s, min_filter=%s " + "lod_bias=%.2f,%s max_aniso=%i, shadow_func=%s\n", sampler, + dword&(1<<31)?" reverse gamma,":"", + dword&(1<<30)?" packed2planar,":"", + dword&(1<<29)?" colorspace conversion,":"", + (dword>>22)&0x1f, + mip_filter, + gen3_decode_sample_filter(dword>>17), + gen3_decode_sample_filter(dword>>14), + ((dword>>5)&0x1ff)/(0x10*1.0), + dword&(1<<4)?" shadow,":"", + dword&(1<<3)?4:2, + gen3_decode_compare_func(dword)); + dword = data[i]; + kgem_debug_print(data, offset, i++, "sampler %d SS3: min_lod=%.2f,%s " + "tcmode_x=%s, tcmode_y=%s, tcmode_z=%s,%s texmap_idx=%i,%s\n", + sampler, ((dword>>24)&0xff)/(0x10*1.0), + dword&(1<<17)?" kill pixel enable,":"", + decode_tex_coord_mode(dword>>12), + decode_tex_coord_mode(dword>>9), + decode_tex_coord_mode(dword>>6), + dword&(1<<5)?" normalized coords,":"", + (dword>>1)&0xf, + dword&(1<<0)?" deinterlacer,":""); + kgem_debug_print(data, offset, i++, "sampler %d SS4: border color\n", + sampler); + } + } + assert(len == i); + return len; + case 0x85: + len = (data[0] & 0x0000000f) + 2; + assert(len == 2); + + kgem_debug_print(data, offset, 0, + "3DSTATE_DEST_BUFFER_VARIABLES\n"); + + switch ((data[1] >> 8) & 0xf) { + case 0x0: format = "g8"; break; + case 0x1: format = "x1r5g5b5"; break; + case 0x2: format = "r5g6b5"; break; + case 0x3: format = "a8r8g8b8"; break; + case 0x4: format = "ycrcb_swapy"; break; + case 0x5: format = "ycrcb_normal"; break; + case 0x6: format = "ycrcb_swapuv"; break; + case 0x7: format = "ycrcb_swapuvy"; break; + case 0x8: format = "a4r4g4b4"; break; + case 0x9: format = "a1r5g5b5"; break; + case 0xa: format = "a2r10g10b10"; break; + default: format = "BAD"; break; + } + switch ((data[1] >> 2) & 0x3) { + case 0x0: zformat = "u16"; break; + case 0x1: zformat = "f16"; break; + case 0x2: zformat = "u24x8"; break; + default: zformat = "BAD"; break; + } + kgem_debug_print(data, 
offset, 1, "%s format, %s depth format, early Z %sabled\n", + format, zformat, + (data[1] & (1 << 31)) ? "en" : "dis"); + return len; + + case 0x8e: + { + const char *name, *tiling; + + len = (data[0] & 0x0000000f) + 2; + assert(len == 3); + + switch((data[1] >> 24) & 0x7) { + case 0x3: name = "color"; break; + case 0x7: name = "depth"; break; + default: name = "unknown"; break; + } + + tiling = "none"; + if (data[1] & (1 << 23)) + tiling = "fenced"; + else if (data[1] & (1 << 22)) + tiling = data[1] & (1 << 21) ? "Y" : "X"; + + kgem_debug_print(data, offset, 0, "3DSTATE_BUFFER_INFO\n"); + kgem_debug_print(data, offset, 1, "%s, tiling = %s, pitch=%d\n", name, tiling, data[1]&0xffff); + + kgem_debug_print(data, offset, 2, "address\n"); + return len; + } + case 0x81: + len = (data[0] & 0x0000000f) + 2; + assert(len == 3); + + kgem_debug_print(data, offset, 0, + "3DSTATE_SCISSOR_RECTANGLE\n"); + kgem_debug_print(data, offset, 1, "(%d,%d)\n", + data[1] & 0xffff, data[1] >> 16); + kgem_debug_print(data, offset, 2, "(%d,%d)\n", + data[2] & 0xffff, data[2] >> 16); + + return len; + case 0x80: + len = (data[0] & 0x0000000f) + 2; + assert(len == 5); + + kgem_debug_print(data, offset, 0, + "3DSTATE_DRAWING_RECTANGLE\n"); + kgem_debug_print(data, offset, 1, "%s\n", + data[1]&(1<<30)?"depth ofs disabled ":""); + kgem_debug_print(data, offset, 2, "(%d,%d)\n", + data[2] & 0xffff, data[2] >> 16); + kgem_debug_print(data, offset, 3, "(%d,%d)\n", + data[3] & 0xffff, data[3] >> 16); + kgem_debug_print(data, offset, 4, "(%d,%d)\n", + (int16_t)(data[4] & 0xffff), + (int16_t)(data[4] >> 16)); + + return len; + case 0x9c: + len = (data[0] & 0x0000000f) + 2; + assert(len == 7); + + kgem_debug_print(data, offset, 0, + "3DSTATE_CLEAR_PARAMETERS\n"); + kgem_debug_print(data, offset, 1, "prim_type=%s, clear=%s%s%s\n", + data[1]&(1<<16)?"CLEAR_RECT":"ZONE_INIT", + data[1]&(1<<2)?"color,":"", + data[1]&(1<<1)?"depth,":"", + data[1]&(1<<0)?"stencil,":""); + kgem_debug_print(data, offset, 2, 
"clear color\n"); + kgem_debug_print(data, offset, 3, "clear depth/stencil\n"); + kgem_debug_print(data, offset, 4, "color value (rgba8888)\n"); + kgem_debug_print(data, offset, 5, "depth value %f\n", + int_as_float(data[5])); + kgem_debug_print(data, offset, 6, "clear stencil\n"); + return len; + } + + for (idx = 0; idx < ARRAY_SIZE(opcodes_3d_1d); idx++) { + opcode_3d_1d = &opcodes_3d_1d[idx]; + if (((data[0] & 0x00ff0000) >> 16) == opcode_3d_1d->opcode) { + len = (data[0] & 0xf) + 2; + kgem_debug_print(data, offset, 0, "%s\n", opcode_3d_1d->name); + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + + return len; + } + } + + kgem_debug_print(data, offset, 0, "3D UNKNOWN: 3d_1d opcode = 0x%x\n", opcode); + assert(0); + return 1; +} + +#define VERTEX_OUT(fmt, ...) do { \ + kgem_debug_print(data, offset, i, " V%d."fmt"\n", vertex, __VA_ARGS__); \ + i++; \ +} while (0) + +static int +gen3_decode_3d_primitive(struct kgem *kgem, uint32_t offset) +{ + uint32_t *data = kgem->batch + offset; + char immediate = (data[0] & (1 << 23)) == 0; + unsigned int len, i, ret; + const char *primtype; + unsigned int vertex = 0; + + switch ((data[0] >> 18) & 0xf) { + case 0x0: primtype = "TRILIST"; break; + case 0x1: primtype = "TRISTRIP"; break; + case 0x2: primtype = "TRISTRIP_REVERSE"; break; + case 0x3: primtype = "TRIFAN"; break; + case 0x4: primtype = "POLYGON"; break; + case 0x5: primtype = "LINELIST"; break; + case 0x6: primtype = "LINESTRIP"; break; + case 0x7: primtype = "RECTLIST"; break; + case 0x8: primtype = "POINTLIST"; break; + case 0x9: primtype = "DIB"; break; + case 0xa: primtype = "CLEAR_RECT"; assert(0); break; + default: primtype = "unknown"; break; + } + + gen3_update_vertex_elements_offsets(kgem); + + /* XXX: 3DPRIM_DIB not supported */ + if (immediate) { + len = (data[0] & 0x0003ffff) + 2; + kgem_debug_print(data, offset, 0, "3DPRIMITIVE inline %s\n", primtype); + for (i = 1; i < len; ) { + ErrorF(" [%d]: ", vertex); + i += 
inline_vertex_out(kgem, data + i) / sizeof(uint32_t); + ErrorF("\n"); + vertex++; + } + + ret = len; + } else { + /* indirect vertices */ + len = data[0] & 0x0000ffff; /* index count */ + if (data[0] & (1 << 17)) { + /* random vertex access */ + kgem_debug_print(data, offset, 0, + "3DPRIMITIVE random indirect %s (%d)\n", primtype, len); + assert(0); + if (len == 0) { + /* vertex indices continue until 0xffff is found */ + } else { + /* fixed size vertex index buffer */ + } + ret = (len + 1) / 2 + 1; + goto out; + } else { + /* sequential vertex access */ + vertex = data[1] & 0xffff; + kgem_debug_print(data, offset, 0, + "3DPRIMITIVE sequential indirect %s, %d starting from " + "%d\n", primtype, len, vertex); + kgem_debug_print(data, offset, 1, " start\n"); + for (i = 0; i < len; i++) { + ErrorF(" [%d]: ", vertex); + indirect_vertex_out(kgem, vertex++); + ErrorF("\n"); + } + ret = 2; + goto out; + } + } + +out: + return ret; +} + +int kgem_gen3_decode_3d(struct kgem *kgem, uint32_t offset) +{ + static const struct { + uint32_t opcode; + int min_len; + int max_len; + const char *name; + } opcodes[] = { + { 0x06, 1, 1, "3DSTATE_ANTI_ALIASING" }, + { 0x08, 1, 1, "3DSTATE_BACKFACE_STENCIL_OPS" }, + { 0x09, 1, 1, "3DSTATE_BACKFACE_STENCIL_MASKS" }, + { 0x16, 1, 1, "3DSTATE_COORD_SET_BINDINGS" }, + { 0x15, 1, 1, "3DSTATE_FOG_COLOR" }, + { 0x0b, 1, 1, "3DSTATE_INDEPENDENT_ALPHA_BLEND" }, + { 0x0d, 1, 1, "3DSTATE_MODES_4" }, + { 0x0c, 1, 1, "3DSTATE_MODES_5" }, + { 0x07, 1, 1, "3DSTATE_RASTERIZATION_RULES" }, + }; + uint32_t *data = kgem->batch + offset; + uint32_t opcode; + unsigned int idx; + + opcode = (data[0] & 0x1f000000) >> 24; + + switch (opcode) { + case 0x1f: + return gen3_decode_3d_primitive(kgem, offset); + case 0x1d: + return gen3_decode_3d_1d(kgem, offset); + case 0x1c: + return gen3_decode_3d_1c(kgem, offset); + } + + for (idx = 0; idx < ARRAY_SIZE(opcodes); idx++) { + if (opcode == opcodes[idx].opcode) { + unsigned int len = 1, i; + + kgem_debug_print(data, 
offset, 0, "%s\n", opcodes[idx].name); + if (opcodes[idx].max_len > 1) { + len = (data[0] & 0xff) + 2; + assert(len >= opcodes[idx].min_len || + len <= opcodes[idx].max_len); + } + + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + return len; + } + } + + kgem_debug_print(data, offset, 0, "3D UNKNOWN: 3d opcode = 0x%x\n", opcode); + return 1; +} + + +void kgem_gen3_finish_state(struct kgem *kgem) +{ + memset(&state, 0, sizeof(state)); +} diff --git a/cogl/driver/drm/kgem_debug_gen4.c b/cogl/driver/drm/kgem_debug_gen4.c new file mode 100644 index 00000000..9b80dc88 --- /dev/null +++ b/cogl/driver/drm/kgem_debug_gen4.c @@ -0,0 +1,688 @@ +/* + * Copyright © 2007-2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Eric Anholt + * Chris Wilson + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include "sna.h" +#include "sna_reg.h" + +#include "gen4_render.h" + +#include "kgem_debug.h" + +static struct state { + struct vertex_buffer { + int handle; + void *base; + const char *ptr; + int pitch; + + struct kgem_bo *current; + } vb[33]; + struct vertex_elements { + int buffer; + int offset; + bool valid; + uint32_t type; + uint8_t swizzle[4]; + } ve[33]; + int num_ve; + + struct dynamic_state { + struct kgem_bo *current; + void *base, *ptr; + } dynamic_state; +} state; + +static void gen4_update_vertex_buffer(struct kgem *kgem, const uint32_t *data) +{ + uint32_t reloc = sizeof(uint32_t) * (&data[1] - kgem->batch); + struct kgem_bo *bo = NULL; + void *base, *ptr; + int i; + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == reloc) + break; + assert(i < kgem->nreloc); + reloc = kgem->reloc[i].target_handle; + + if (reloc == 0) { + base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == reloc) + break; + assert(&bo->request != &kgem->next_request->buffers); + base = kgem_bo_map__debug(kgem, bo); + } + ptr = (char *)base + kgem->reloc[i].delta; + + i = data[0] >> 27; + + state.vb[i].current = bo; + state.vb[i].base = base; + state.vb[i].ptr = ptr; + state.vb[i].pitch = data[0] & 0x7ff; +} + +static uint32_t +get_ve_component(uint32_t data, int component) +{ + return (data >> (16 + (3 - component) * 4)) & 0x7; +} + +static void gen4_update_vertex_elements(struct kgem *kgem, int id, const uint32_t *data) +{ + state.ve[id].buffer = data[0] >> 27; + state.ve[id].valid = !!(data[0] & (1 << 26)); + state.ve[id].type = (data[0] >> 16) & 0x1ff; + state.ve[id].offset = data[0] & 0x7ff; + state.ve[id].swizzle[0] = get_ve_component(data[1], 0); + state.ve[id].swizzle[1] = get_ve_component(data[1], 1); + state.ve[id].swizzle[2] = get_ve_component(data[1], 2); + 
state.ve[id].swizzle[3] = get_ve_component(data[1], 3); +} + +static void vertices_sint16_out(const struct vertex_elements *ve, const int16_t *v, int max) +{ + int c; + + ErrorF("("); + for (c = 0; c < max; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("%d", v[c]); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + for (; c < 4; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("1.0"); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + ErrorF(")"); +} + +static void vertices_float_out(const struct vertex_elements *ve, const float *f, int max) +{ + int c, o; + + ErrorF("("); + for (c = o = 0; c < 4 && o < max; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("%f", f[o++]); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + for (; c < 4; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("1.0"); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + ErrorF(")"); +} + +static void ve_out(const struct vertex_elements *ve, const void *ptr) +{ + switch (ve->type) { + case GEN4_SURFACEFORMAT_R32_FLOAT: + vertices_float_out(ve, ptr, 1); + break; + case GEN4_SURFACEFORMAT_R32G32_FLOAT: + vertices_float_out(ve, ptr, 2); + break; + case GEN4_SURFACEFORMAT_R32G32B32_FLOAT: + vertices_float_out(ve, ptr, 3); + break; + case GEN4_SURFACEFORMAT_R32G32B32A32_FLOAT: + vertices_float_out(ve, ptr, 4); + break; + case 
GEN4_SURFACEFORMAT_R16_SINT: + vertices_sint16_out(ve, ptr, 1); + break; + case GEN4_SURFACEFORMAT_R16G16_SINT: + vertices_sint16_out(ve, ptr, 2); + break; + case GEN4_SURFACEFORMAT_R16G16B16A16_SINT: + vertices_sint16_out(ve, ptr, 4); + break; + case GEN4_SURFACEFORMAT_R16_SSCALED: + vertices_sint16_out(ve, ptr, 1); + break; + case GEN4_SURFACEFORMAT_R16G16_SSCALED: + vertices_sint16_out(ve, ptr, 2); + break; + case GEN4_SURFACEFORMAT_R16G16B16A16_SSCALED: + vertices_sint16_out(ve, ptr, 4); + break; + } +} + +static void indirect_vertex_out(struct kgem *kgem, uint32_t v) +{ + int i = 0; + + do { + const struct vertex_elements *ve = &state.ve[i]; + const struct vertex_buffer *vb = &state.vb[ve->buffer]; + const void *ptr = vb->ptr + v * vb->pitch + ve->offset; + + if (!ve->valid) + continue; + + ve_out(ve, ptr); + + while (++i <= state.num_ve && !state.ve[i].valid) + ; + + if (i <= state.num_ve) + ErrorF(", "); + } while (i <= state.num_ve); +} + +static void primitive_out(struct kgem *kgem, uint32_t *data) +{ + int n; + + assert((data[0] & (1<<15)) == 0); /* XXX index buffers */ + + for (n = 0; n < data[1]; n++) { + int v = data[2] + n; + ErrorF(" [%d:%d] = ", n, v); + indirect_vertex_out(kgem, v); + ErrorF("\n"); + } +} + +static void +state_base_out(uint32_t *data, uint32_t offset, unsigned int index, + const char *name) +{ + if (data[index] & 1) + kgem_debug_print(data, offset, index, + "%s state base address 0x%08x\n", + name, data[index] & ~1); + else + kgem_debug_print(data, offset, index, + "%s state base not updated\n", + name); +} + +static void +state_max_out(uint32_t *data, uint32_t offset, unsigned int index, + const char *name) +{ + if (data[index] == 1) + kgem_debug_print(data, offset, index, + "%s state upper bound disabled\n", name); + else if (data[index] & 1) + kgem_debug_print(data, offset, index, + "%s state upper bound 0x%08x\n", + name, data[index] & ~1); + else + kgem_debug_print(data, offset, index, + "%s state upper bound not updated\n", + 
name); +} + +static const char * +get_965_surfacetype(unsigned int surfacetype) +{ + switch (surfacetype) { + case 0: return "1D"; + case 1: return "2D"; + case 2: return "3D"; + case 3: return "CUBE"; + case 4: return "BUFFER"; + case 7: return "NULL"; + default: return "unknown"; + } +} + +static const char * +get_965_depthformat(unsigned int depthformat) +{ + switch (depthformat) { + case 0: return "s8_z24float"; + case 1: return "z32float"; + case 2: return "z24s8"; + case 5: return "z16"; + default: return "unknown"; + } +} + +static const char * +get_965_element_component(uint32_t data, int component) +{ + uint32_t component_control = (data >> (16 + (3 - component) * 4)) & 0x7; + + switch (component_control) { + case 0: + return "nostore"; + case 1: + switch (component) { + case 0: return "X"; + case 1: return "Y"; + case 2: return "Z"; + case 3: return "W"; + default: return "fail"; + } + case 2: + return "0.0"; + case 3: + return "1.0"; + case 4: + return "0x1"; + case 5: + return "VID"; + default: + return "fail"; + } +} + +static const char * +get_965_prim_type(uint32_t data) +{ + uint32_t primtype = (data >> 10) & 0x1f; + + switch (primtype) { + case 0x01: return "point list"; + case 0x02: return "line list"; + case 0x03: return "line strip"; + case 0x04: return "tri list"; + case 0x05: return "tri strip"; + case 0x06: return "tri fan"; + case 0x07: return "quad list"; + case 0x08: return "quad strip"; + case 0x09: return "line list adj"; + case 0x0a: return "line strip adj"; + case 0x0b: return "tri list adj"; + case 0x0c: return "tri strip adj"; + case 0x0d: return "tri strip reverse"; + case 0x0e: return "polygon"; + case 0x0f: return "rect list"; + case 0x10: return "line loop"; + case 0x11: return "point list bf"; + case 0x12: return "line strip cont"; + case 0x13: return "line strip bf"; + case 0x14: return "line strip cont bf"; + case 0x15: return "tri fan no stipple"; + default: return "fail"; + } +} + +#if 0 +struct reloc { + struct kgem_bo *bo; 
+ void *base; +}; + +static void * +get_reloc(struct kgem *kgem, + void *base, const uint32_t *reloc, + struct reloc *r) +{ + uint32_t delta = *reloc; + + memset(r, 0, sizeof(*r)); + + if (base == 0) { + uint32_t handle = sizeof(uint32_t) * (reloc - kgem->batch); + struct kgem_bo *bo = NULL; + int i; + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == handle) + break; + assert(i < kgem->nreloc); + handle = kgem->reloc[i].target_handle; + delta = kgem->reloc[i].delta; + + if (handle == 0) { + base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == handle) + break; + assert(&bo->request != &kgem->next_request->buffers); + base = kgem_bo_map__debug(kgem, bo); + r->bo = bo; + r->base = base; + } + } + + return (char *)base + delta; +} +#endif + +int kgem_gen4_decode_3d(struct kgem *kgem, uint32_t offset) +{ + static const struct { + uint32_t opcode; + int min_len; + int max_len; + const char *name; + } opcodes[] = { + { 0x6000, 3, 3, "URB_FENCE" }, + { 0x6001, 2, 2, "CS_URB_FENCE" }, + { 0x6002, 2, 2, "CONSTANT_BUFFER" }, + { 0x6101, 6, 6, "STATE_BASE_ADDRESS" }, + { 0x6102, 2, 2 , "STATE_SIP" }, + { 0x6104, 1, 1, "3DSTATE_PIPELINE_SELECT" }, + { 0x680b, 1, 1, "3DSTATE_VF_STATISTICS" }, + { 0x6904, 1, 1, "3DSTATE_PIPELINE_SELECT" }, + { 0x7800, 7, 7, "3DSTATE_PIPELINED_POINTERS" }, + { 0x7801, 6, 6, "3DSTATE_BINDING_TABLE_POINTERS" }, + { 0x7808, 5, 257, "3DSTATE_VERTEX_BUFFERS" }, + { 0x7809, 3, 256, "3DSTATE_VERTEX_ELEMENTS" }, + { 0x780a, 3, 3, "3DSTATE_INDEX_BUFFER" }, + { 0x780b, 1, 1, "3DSTATE_VF_STATISTICS" }, + { 0x7900, 4, 4, "3DSTATE_DRAWING_RECTANGLE" }, + { 0x7901, 5, 5, "3DSTATE_CONSTANT_COLOR" }, + { 0x7905, 5, 7, "3DSTATE_DEPTH_BUFFER" }, + { 0x7906, 2, 2, "3DSTATE_POLY_STIPPLE_OFFSET" }, + { 0x7907, 33, 33, "3DSTATE_POLY_STIPPLE_PATTERN" }, + { 0x7908, 3, 3, "3DSTATE_LINE_STIPPLE" }, + { 0x7909, 2, 2, "3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" }, + { 0x7909, 2, 2, 
"3DSTATE_CLEAR_PARAMS" }, + { 0x790a, 3, 3, "3DSTATE_AA_LINE_PARAMETERS" }, + { 0x790b, 4, 4, "3DSTATE_GS_SVB_INDEX" }, + { 0x790d, 3, 3, "3DSTATE_MULTISAMPLE" }, + { 0x7910, 2, 2, "3DSTATE_CLEAR_PARAMS" }, + { 0x7b00, 6, 6, "3DPRIMITIVE" }, + { 0x7805, 3, 3, "3DSTATE_URB" }, + { 0x7815, 5, 5, "3DSTATE_CONSTANT_VS_STATE" }, + { 0x7816, 5, 5, "3DSTATE_CONSTANT_GS_STATE" }, + { 0x7817, 5, 5, "3DSTATE_CONSTANT_PS_STATE" }, + { 0x7818, 2, 2, "3DSTATE_SAMPLE_MASK" }, + }; + uint32_t *data = kgem->batch + offset; + uint32_t op; + unsigned int len; + int i; + const char *desc1 = NULL; + + len = (data[0] & 0xff) + 2; + op = (data[0] & 0xffff0000) >> 16; + switch (op) { + case 0x6000: + assert(len == 3); + + kgem_debug_print(data, offset, 0, "URB_FENCE: %s%s%s%s%s%s\n", + (data[0] >> 13) & 1 ? "cs " : "", + (data[0] >> 12) & 1 ? "vfe " : "", + (data[0] >> 11) & 1 ? "sf " : "", + (data[0] >> 10) & 1 ? "clip " : "", + (data[0] >> 9) & 1 ? "gs " : "", + (data[0] >> 8) & 1 ? "vs " : ""); + kgem_debug_print(data, offset, 1, + "vs fence: %d, gs_fence: %d, clip_fence: %d\n", + data[1] & 0x3ff, + (data[1] >> 10) & 0x3ff, + (data[1] >> 20) & 0x3ff); + kgem_debug_print(data, offset, 2, + "sf fence: %d, vfe_fence: %d, cs_fence: %d\n", + data[2] & 0x3ff, + (data[2] >> 10) & 0x3ff, + (data[2] >> 20) & 0x7ff); + return len; + + case 0x6001: + kgem_debug_print(data, offset, 0, "CS_URB_STATE\n"); + kgem_debug_print(data, offset, 1, "entry_size: %d [%d bytes], n_entries: %d\n", + (data[1] >> 4) & 0x1f, + (((data[1] >> 4) & 0x1f) + 1) * 64, + data[1] & 0x7); + return len; + case 0x6002: + kgem_debug_print(data, offset, 0, "CONSTANT_BUFFER: %s\n", + (data[0] >> 8) & 1 ? 
"valid" : "invalid"); + kgem_debug_print(data, offset, 1, "offset: 0x%08x, length: %d bytes\n", + data[1] & ~0x3f, ((data[1] & 0x3f) + 1) * 64); + return len; + case 0x6101: + i = 0; + kgem_debug_print(data, offset, i++, "STATE_BASE_ADDRESS\n"); + assert(len == 6); + + state_base_out(data, offset, i++, "general"); + state_base_out(data, offset, i++, "surface"); + state_base_out(data, offset, i++, "media"); + + state_max_out(data, offset, i++, "general"); + state_max_out(data, offset, i++, "media"); + + return len; + + case 0x7801: + assert(len == 6); + + kgem_debug_print(data, offset, 0, + "3DSTATE_BINDING_TABLE_POINTERS\n"); + kgem_debug_print(data, offset, 1, "VS binding table\n"); + kgem_debug_print(data, offset, 2, "GS binding table\n"); + kgem_debug_print(data, offset, 3, "CLIP binding table\n"); + kgem_debug_print(data, offset, 4, "SF binding table\n"); + kgem_debug_print(data, offset, 5, "WM binding table\n"); + + return len; + + case 0x7808: + assert((len - 1) % 4 == 0); + kgem_debug_print(data, offset, 0, "3DSTATE_VERTEX_BUFFERS\n"); + + for (i = 1; i < len;) { + gen4_update_vertex_buffer(kgem, data + i); + + kgem_debug_print(data, offset, i, "buffer %d: %s, pitch %db\n", + data[i] >> 27, + data[i] & (1 << 20) ? "random" : "sequential", + data[i] & 0x07ff); + i++; + kgem_debug_print(data, offset, i++, "buffer address\n"); + kgem_debug_print(data, offset, i++, "max index\n"); + kgem_debug_print(data, offset, i++, "mbz\n"); + } + return len; + + case 0x7809: + assert((len + 1) % 2 == 0); + kgem_debug_print(data, offset, 0, "3DSTATE_VERTEX_ELEMENTS\n"); + + memset(state.ve, 0, sizeof(state.ve)); /* XXX? */ + for (i = 1; i < len;) { + gen4_update_vertex_elements(kgem, (i - 1)/2, data + i); + + kgem_debug_print(data, offset, i, "buffer %d: %svalid, type 0x%04x, " + "src offset 0x%04x bytes\n", + data[i] >> 27, + data[i] & (1 << 26) ? 
"" : "in", + (data[i] >> 16) & 0x1ff, + data[i] & 0x07ff); + i++; + kgem_debug_print(data, offset, i, "(%s, %s, %s, %s), " + "dst offset 0x%02x bytes\n", + get_965_element_component(data[i], 0), + get_965_element_component(data[i], 1), + get_965_element_component(data[i], 2), + get_965_element_component(data[i], 3), + (data[i] & 0xff) * 4); + i++; + } + state.num_ve = (len - 1) / 2; /* XXX? */ + return len; + + case 0x780a: + assert(len == 3); + kgem_debug_print(data, offset, 0, "3DSTATE_INDEX_BUFFER\n"); + kgem_debug_print(data, offset, 1, "beginning buffer address\n"); + kgem_debug_print(data, offset, 2, "ending buffer address\n"); + return len; + + case 0x7900: + assert(len == 4); + kgem_debug_print(data, offset, 0, + "3DSTATE_DRAWING_RECTANGLE\n"); + kgem_debug_print(data, offset, 1, "top left: %d,%d\n", + data[1] & 0xffff, + (data[1] >> 16) & 0xffff); + kgem_debug_print(data, offset, 2, "bottom right: %d,%d\n", + data[2] & 0xffff, + (data[2] >> 16) & 0xffff); + kgem_debug_print(data, offset, 3, "origin: %d,%d\n", + (int)data[3] & 0xffff, + ((int)data[3] >> 16) & 0xffff); + return len; + + case 0x7905: + assert(len == 7); + kgem_debug_print(data, offset, 0, + "3DSTATE_DEPTH_BUFFER\n"); + kgem_debug_print(data, offset, 1, "%s, %s, pitch = %d bytes, %stiled, HiZ %d, Seperate Stencil %d\n", + get_965_surfacetype(data[1] >> 29), + get_965_depthformat((data[1] >> 18) & 0x7), + (data[1] & 0x0001ffff) + 1, + data[1] & (1 << 27) ? 
"" : "not ", + (data[1] & (1 << 22)) != 0, + (data[1] & (1 << 21)) != 0); + kgem_debug_print(data, offset, 2, "depth offset\n"); + kgem_debug_print(data, offset, 3, "%dx%d\n", + ((data[3] & 0x0007ffc0) >> 6) + 1, + ((data[3] & 0xfff80000) >> 19) + 1); + kgem_debug_print(data, offset, 4, "volume depth\n"); + kgem_debug_print(data, offset, 5, "\n"); + kgem_debug_print(data, offset, 6, "\n"); + return len; + + case 0x7a00: + assert(len == 4 || len == 5); + switch ((data[1] >> 14) & 0x3) { + case 0: desc1 = "no write"; break; + case 1: desc1 = "qword write"; break; + case 2: desc1 = "PS_DEPTH_COUNT write"; break; + case 3: desc1 = "TIMESTAMP write"; break; + } + kgem_debug_print(data, offset, 0, "PIPE_CONTROL\n"); + kgem_debug_print(data, offset, 1, + "%s, %scs stall, %stlb invalidate, " + "%ssync gfdt, %sdepth stall, %sRC write flush, " + "%sinst flush, %sTC flush\n", + desc1, + data[1] & (1 << 20) ? "" : "no ", + data[1] & (1 << 18) ? "" : "no ", + data[1] & (1 << 17) ? "" : "no ", + data[1] & (1 << 13) ? "" : "no ", + data[1] & (1 << 12) ? "" : "no ", + data[1] & (1 << 11) ? "" : "no ", + data[1] & (1 << 10) ? "" : "no "); + if (len == 5) { + kgem_debug_print(data, offset, 2, "destination address\n"); + kgem_debug_print(data, offset, 3, "immediate dword low\n"); + kgem_debug_print(data, offset, 4, "immediate dword high\n"); + } else { + for (i = 2; i < len; i++) { + kgem_debug_print(data, offset, i, "\n"); + } + } + return len; + + case 0x7b00: + assert(len == 6); + kgem_debug_print(data, offset, 0, + "3DPRIMITIVE: %s %s\n", + get_965_prim_type(data[0]), + (data[0] & (1 << 15)) ? 
"random" : "sequential"); + kgem_debug_print(data, offset, 1, "vertex count\n"); + kgem_debug_print(data, offset, 2, "start vertex\n"); + kgem_debug_print(data, offset, 3, "instance count\n"); + kgem_debug_print(data, offset, 4, "start instance\n"); + kgem_debug_print(data, offset, 5, "index bias\n"); + primitive_out(kgem, data); + return len; + } + + /* For the rest, just dump the bytes */ + for (i = 0; i < ARRAY_SIZE(opcodes); i++) + if (op == opcodes[i].opcode) + break; + + assert(i < ARRAY_SIZE(opcodes)); + + len = 1; + kgem_debug_print(data, offset, 0, "%s\n", opcodes[i].name); + if (opcodes[i].max_len > 1) { + len = (data[0] & 0xff) + 2; + assert(len >= opcodes[i].min_len && + len <= opcodes[i].max_len); + } + + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + + return len; +} + +void kgem_gen4_finish_state(struct kgem *kgem) +{ + memset(&state, 0, sizeof(state)); +} diff --git a/cogl/driver/drm/kgem_debug_gen5.c b/cogl/driver/drm/kgem_debug_gen5.c new file mode 100644 index 00000000..e23ceb1f --- /dev/null +++ b/cogl/driver/drm/kgem_debug_gen5.c @@ -0,0 +1,664 @@ +/* + * Copyright © 2007-2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Eric Anholt + * Chris Wilson + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include "sna.h" +#include "sna_reg.h" + +#include "gen5_render.h" + +#include "kgem_debug.h" + +static struct state { + struct vertex_buffer { + int handle; + void *base; + int size; + const char *ptr; + int pitch; + + struct kgem_bo *current; + } vb[17]; + struct vertex_elements { + int buffer; + int offset; + bool valid; + uint32_t type; + uint8_t swizzle[4]; + } ve[17]; + int num_ve; + + struct dynamic_state { + struct kgem_bo *current; + void *base, *ptr; + } dynamic_state; +} state; + +static void gen5_update_vertex_buffer(struct kgem *kgem, const uint32_t *data) +{ + struct drm_i915_gem_relocation_entry *reloc; + struct kgem_bo *bo = NULL; + void *base, *ptr; + int i, size; + + reloc = kgem_debug_get_reloc_entry(kgem, &data[1] - kgem->batch); + if (reloc->target_handle == 0) { + base = kgem->batch; + size = kgem->nbatch * sizeof(uint32_t); + } else { + bo = kgem_debug_get_bo_for_reloc_entry(kgem, reloc); + base = kgem_bo_map__debug(kgem, bo); + size = kgem_bo_size(bo); + } + ptr = (char *)base + reloc->delta; + + i = data[0] >> 27; + + state.vb[i].handle = reloc->target_handle; + state.vb[i].current = bo; + state.vb[i].base = base; + state.vb[i].ptr = ptr; + state.vb[i].pitch = data[0] & 0x7ff; + state.vb[i].size = size; +} + +static uint32_t +get_ve_component(uint32_t data, int component) +{ + return (data >> (16 + (3 - component) * 4)) & 
0x7; +} + +static void gen5_update_vertex_elements(struct kgem *kgem, int id, const uint32_t *data) +{ + state.ve[id].buffer = data[0] >> 27; + state.ve[id].valid = !!(data[0] & (1 << 26)); + state.ve[id].type = (data[0] >> 16) & 0x1ff; + state.ve[id].offset = data[0] & 0x7ff; + state.ve[id].swizzle[0] = get_ve_component(data[1], 0); + state.ve[id].swizzle[1] = get_ve_component(data[1], 1); + state.ve[id].swizzle[2] = get_ve_component(data[1], 2); + state.ve[id].swizzle[3] = get_ve_component(data[1], 3); +} + +static void vertices_sint16_out(const struct vertex_elements *ve, const int16_t *v, int max) +{ + int c, o; + + ErrorF("("); + for (c = o = 0; c < 4 && o < max; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("%d", v[o++]); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (o < max) + ErrorF(", "); + } + ErrorF(")"); +} + +static void vertices_float_out(const struct vertex_elements *ve, const float *f, int max) +{ + int c, o; + + ErrorF("("); + for (c = o = 0; c < 4 && o < max; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("%f", f[o++]); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (o < max) + ErrorF(", "); + } + ErrorF(")"); +} + +static void ve_out(const struct vertex_elements *ve, const void *ptr) +{ + switch (ve->type) { + case GEN5_SURFACEFORMAT_R32_FLOAT: + vertices_float_out(ve, ptr, 1); + break; + case GEN5_SURFACEFORMAT_R32G32_FLOAT: + vertices_float_out(ve, ptr, 2); + break; + case GEN5_SURFACEFORMAT_R32G32B32_FLOAT: + vertices_float_out(ve, ptr, 3); + break; + case GEN5_SURFACEFORMAT_R32G32B32A32_FLOAT: + vertices_float_out(ve, ptr, 4); + break; + case GEN5_SURFACEFORMAT_R16_SINT: + vertices_sint16_out(ve, ptr, 1); + break; + case GEN5_SURFACEFORMAT_R16G16_SINT: + 
vertices_sint16_out(ve, ptr, 2); + break; + case GEN5_SURFACEFORMAT_R16G16B16A16_SINT: + vertices_sint16_out(ve, ptr, 4); + break; + case GEN5_SURFACEFORMAT_R16_SSCALED: + vertices_sint16_out(ve, ptr, 1); + break; + case GEN5_SURFACEFORMAT_R16G16_SSCALED: + vertices_sint16_out(ve, ptr, 2); + break; + case GEN5_SURFACEFORMAT_R16G16B16A16_SSCALED: + vertices_sint16_out(ve, ptr, 4); + break; + } +} + +static void indirect_vertex_out(struct kgem *kgem, uint32_t v) +{ + int i = 1; + + do { + const struct vertex_elements *ve = &state.ve[i]; + const struct vertex_buffer *vb = &state.vb[ve->buffer]; + const void *ptr = vb->ptr + v * vb->pitch + ve->offset; + + if (!ve->valid) + continue; + + assert(vb->pitch); + assert(ve->offset + v*vb->pitch < vb->size); + + ve_out(ve, ptr); + + while (++i <= state.num_ve && !state.ve[i].valid) + ; + + if (i <= state.num_ve) + ErrorF(", "); + } while (i <= state.num_ve); +} + +static void primitive_out(struct kgem *kgem, uint32_t *data) +{ + int n; + + assert((data[0] & (1<<15)) == 0); /* XXX index buffers */ + + for (n = 0; n < data[1]; n++) { + int v = data[2] + n; + ErrorF(" [%d:%d] = ", n, v); + indirect_vertex_out(kgem, v); + ErrorF("\n"); + } +} + +static void +state_base_out(uint32_t *data, uint32_t offset, unsigned int index, + const char *name) +{ + if (data[index] & 1) + kgem_debug_print(data, offset, index, + "%s state base address 0x%08x\n", + name, data[index] & ~1); + else + kgem_debug_print(data, offset, index, + "%s state base not updated\n", + name); +} + +static void +state_max_out(uint32_t *data, uint32_t offset, unsigned int index, + const char *name) +{ + if (data[index] == 1) + kgem_debug_print(data, offset, index, + "%s state upper bound disabled\n", name); + else if (data[index] & 1) + kgem_debug_print(data, offset, index, + "%s state upper bound 0x%08x\n", + name, data[index] & ~1); + else + kgem_debug_print(data, offset, index, + "%s state upper bound not updated\n", + name); +} + +static const char * 
+get_965_surfacetype(unsigned int surfacetype) +{ + switch (surfacetype) { + case 0: return "1D"; + case 1: return "2D"; + case 2: return "3D"; + case 3: return "CUBE"; + case 4: return "BUFFER"; + case 7: return "NULL"; + default: return "unknown"; + } +} + +static const char * +get_965_depthformat(unsigned int depthformat) +{ + switch (depthformat) { + case 0: return "s8_z24float"; + case 1: return "z32float"; + case 2: return "z24s8"; + case 5: return "z16"; + default: return "unknown"; + } +} + +static const char * +get_965_element_component(uint32_t data, int component) +{ + uint32_t component_control = (data >> (16 + (3 - component) * 4)) & 0x7; + + switch (component_control) { + case 0: + return "nostore"; + case 1: + switch (component) { + case 0: return "X"; + case 1: return "Y"; + case 2: return "Z"; + case 3: return "W"; + default: return "fail"; + } + case 2: + return "0.0"; + case 3: + return "1.0"; + case 4: + return "0x1"; + case 5: + return "VID"; + default: + return "fail"; + } +} + +static const char * +get_965_prim_type(uint32_t data) +{ + uint32_t primtype = (data >> 10) & 0x1f; + + switch (primtype) { + case 0x01: return "point list"; + case 0x02: return "line list"; + case 0x03: return "line strip"; + case 0x04: return "tri list"; + case 0x05: return "tri strip"; + case 0x06: return "tri fan"; + case 0x07: return "quad list"; + case 0x08: return "quad strip"; + case 0x09: return "line list adj"; + case 0x0a: return "line strip adj"; + case 0x0b: return "tri list adj"; + case 0x0c: return "tri strip adj"; + case 0x0d: return "tri strip reverse"; + case 0x0e: return "polygon"; + case 0x0f: return "rect list"; + case 0x10: return "line loop"; + case 0x11: return "point list bf"; + case 0x12: return "line strip cont"; + case 0x13: return "line strip bf"; + case 0x14: return "line strip cont bf"; + case 0x15: return "tri fan no stipple"; + default: return "fail"; + } +} + +#if 0 +struct reloc { + struct kgem_bo *bo; + void *base; +}; + +static void 
* +get_reloc(struct kgem *kgem, + void *base, const uint32_t *reloc, + struct reloc *r) +{ + uint32_t delta = *reloc; + + memset(r, 0, sizeof(*r)); + + if (base == 0) { + uint32_t handle = sizeof(uint32_t) * (reloc - kgem->batch); + struct kgem_bo *bo = NULL; + int i; + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == handle) + break; + assert(i < kgem->nreloc); + handle = kgem->reloc[i].target_handle; + delta = kgem->reloc[i].delta; + + if (handle == 0) { + base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == handle) + break; + assert(&bo->request != &kgem->next_request->buffers); + base = kgem_bo_map(kgem, bo, PROT_READ); + r->bo = bo; + r->base = base; + } + } + + return (char *)base + delta; +} +#endif + +int kgem_gen5_decode_3d(struct kgem *kgem, uint32_t offset) +{ + static const struct { + uint32_t opcode; + int min_len; + int max_len; + const char *name; + } opcodes[] = { + { 0x6000, 3, 3, "URB_FENCE" }, + { 0x6001, 2, 2, "CS_URB_FENCE" }, + { 0x6002, 2, 2, "CONSTANT_BUFFER" }, + { 0x6101, 6, 6, "STATE_BASE_ADDRESS" }, + { 0x6102, 2, 2 , "STATE_SIP" }, + { 0x6104, 1, 1, "3DSTATE_PIPELINE_SELECT" }, + { 0x680b, 1, 1, "3DSTATE_VF_STATISTICS" }, + { 0x6904, 1, 1, "3DSTATE_PIPELINE_SELECT" }, + { 0x7800, 7, 7, "3DSTATE_PIPELINED_POINTERS" }, + { 0x7801, 6, 6, "3DSTATE_BINDING_TABLE_POINTERS" }, + { 0x7808, 5, 257, "3DSTATE_VERTEX_BUFFERS" }, + { 0x7809, 3, 256, "3DSTATE_VERTEX_ELEMENTS" }, + { 0x780a, 3, 3, "3DSTATE_INDEX_BUFFER" }, + { 0x780b, 1, 1, "3DSTATE_VF_STATISTICS" }, + { 0x7900, 4, 4, "3DSTATE_DRAWING_RECTANGLE" }, + { 0x7901, 5, 5, "3DSTATE_CONSTANT_COLOR" }, + { 0x7905, 5, 7, "3DSTATE_DEPTH_BUFFER" }, + { 0x7906, 2, 2, "3DSTATE_POLY_STIPPLE_OFFSET" }, + { 0x7907, 33, 33, "3DSTATE_POLY_STIPPLE_PATTERN" }, + { 0x7908, 3, 3, "3DSTATE_LINE_STIPPLE" }, + { 0x7909, 2, 2, "3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" }, + { 0x7909, 2, 2, "3DSTATE_CLEAR_PARAMS" }, + { 0x790a, 3, 3, 
"3DSTATE_AA_LINE_PARAMETERS" }, + { 0x790b, 4, 4, "3DSTATE_GS_SVB_INDEX" }, + { 0x790d, 3, 3, "3DSTATE_MULTISAMPLE" }, + { 0x7910, 2, 2, "3DSTATE_CLEAR_PARAMS" }, + { 0x7b00, 6, 6, "3DPRIMITIVE" }, + { 0x7805, 3, 3, "3DSTATE_URB" }, + { 0x7815, 5, 5, "3DSTATE_CONSTANT_VS_STATE" }, + { 0x7816, 5, 5, "3DSTATE_CONSTANT_GS_STATE" }, + { 0x7817, 5, 5, "3DSTATE_CONSTANT_PS_STATE" }, + { 0x7818, 2, 2, "3DSTATE_SAMPLE_MASK" }, + }; + uint32_t *data = kgem->batch + offset; + uint32_t op; + unsigned int len; + int i; + const char *desc1 = NULL; + + len = (data[0] & 0xff) + 2; + op = (data[0] & 0xffff0000) >> 16; + switch (op) { + case 0x6000: + assert(len == 3); + + kgem_debug_print(data, offset, 0, "URB_FENCE: %s%s%s%s%s%s\n", + (data[0] >> 13) & 1 ? "cs " : "", + (data[0] >> 12) & 1 ? "vfe " : "", + (data[0] >> 11) & 1 ? "sf " : "", + (data[0] >> 10) & 1 ? "clip " : "", + (data[0] >> 9) & 1 ? "gs " : "", + (data[0] >> 8) & 1 ? "vs " : ""); + kgem_debug_print(data, offset, 1, + "vs fence: %d, gs_fence: %d, clip_fence: %d\n", + data[1] & 0x3ff, + (data[1] >> 10) & 0x3ff, + (data[1] >> 20) & 0x3ff); + kgem_debug_print(data, offset, 2, + "sf fence: %d, vfe_fence: %d, cs_fence: %d\n", + data[2] & 0x3ff, + (data[2] >> 10) & 0x3ff, + (data[2] >> 20) & 0x7ff); + return len; + + case 0x6001: + kgem_debug_print(data, offset, 0, "CS_URB_STATE\n"); + kgem_debug_print(data, offset, 1, "entry_size: %d [%d bytes], n_entries: %d\n", + (data[1] >> 4) & 0x1f, + (((data[1] >> 4) & 0x1f) + 1) * 64, + data[1] & 0x7); + return len; + case 0x6002: + kgem_debug_print(data, offset, 0, "CONSTANT_BUFFER: %s\n", + (data[0] >> 8) & 1 ? 
"valid" : "invalid"); + kgem_debug_print(data, offset, 1, "offset: 0x%08x, length: %d bytes\n", + data[1] & ~0x3f, ((data[1] & 0x3f) + 1) * 64); + return len; + case 0x6101: + i = 0; + kgem_debug_print(data, offset, i++, "STATE_BASE_ADDRESS\n"); + assert(len == 8); + + state_base_out(data, offset, i++, "general"); + state_base_out(data, offset, i++, "surface"); + state_base_out(data, offset, i++, "media"); + state_base_out(data, offset, i++, "instruction"); + + state_max_out(data, offset, i++, "general"); + state_max_out(data, offset, i++, "media"); + state_max_out(data, offset, i++, "instruction"); + + return len; + + case 0x7801: + assert(len == 6); + + kgem_debug_print(data, offset, 0, + "3DSTATE_BINDING_TABLE_POINTERS\n"); + kgem_debug_print(data, offset, 1, "VS binding table\n"); + kgem_debug_print(data, offset, 2, "GS binding table\n"); + kgem_debug_print(data, offset, 3, "CLIP binding table\n"); + kgem_debug_print(data, offset, 4, "SF binding table\n"); + kgem_debug_print(data, offset, 5, "WM binding table\n"); + + return len; + + case 0x7808: + assert((len - 1) % 4 == 0); + kgem_debug_print(data, offset, 0, "3DSTATE_VERTEX_BUFFERS\n"); + + for (i = 1; i < len;) { + gen5_update_vertex_buffer(kgem, data + i); + + kgem_debug_print(data, offset, i, "buffer %d: %s, pitch %db\n", + data[i] >> 27, + data[i] & (1 << 20) ? "random" : "sequential", + data[i] & 0x07ff); + i++; + kgem_debug_print(data, offset, i++, "buffer address\n"); + kgem_debug_print(data, offset, i++, "max index\n"); + kgem_debug_print(data, offset, i++, "mbz\n"); + } + return len; + + case 0x7809: + assert((len + 1) % 2 == 0); + kgem_debug_print(data, offset, 0, "3DSTATE_VERTEX_ELEMENTS\n"); + + memset(state.ve, 0, sizeof(state.ve)); /* XXX? */ + for (i = 1; i < len;) { + gen5_update_vertex_elements(kgem, (i - 1)/2, data + i); + + kgem_debug_print(data, offset, i, "buffer %d: %svalid, type 0x%04x, " + "src offset 0x%04x bytes\n", + data[i] >> 27, + data[i] & (1 << 26) ? 
"" : "in", + (data[i] >> 16) & 0x1ff, + data[i] & 0x07ff); + i++; + kgem_debug_print(data, offset, i, "(%s, %s, %s, %s), " + "dst offset 0x%02x bytes\n", + get_965_element_component(data[i], 0), + get_965_element_component(data[i], 1), + get_965_element_component(data[i], 2), + get_965_element_component(data[i], 3), + (data[i] & 0xff) * 4); + i++; + } + state.num_ve = (len - 1) / 2; /* XXX? */ + return len; + + case 0x780a: + assert(len == 3); + kgem_debug_print(data, offset, 0, "3DSTATE_INDEX_BUFFER\n"); + kgem_debug_print(data, offset, 1, "beginning buffer address\n"); + kgem_debug_print(data, offset, 2, "ending buffer address\n"); + return len; + + case 0x7900: + assert(len == 4); + kgem_debug_print(data, offset, 0, + "3DSTATE_DRAWING_RECTANGLE\n"); + kgem_debug_print(data, offset, 1, "top left: %d,%d\n", + data[1] & 0xffff, + (data[1] >> 16) & 0xffff); + kgem_debug_print(data, offset, 2, "bottom right: %d,%d\n", + data[2] & 0xffff, + (data[2] >> 16) & 0xffff); + kgem_debug_print(data, offset, 3, "origin: %d,%d\n", + (int)data[3] & 0xffff, + ((int)data[3] >> 16) & 0xffff); + return len; + + case 0x7905: + assert(len == 7); + kgem_debug_print(data, offset, 0, + "3DSTATE_DEPTH_BUFFER\n"); + kgem_debug_print(data, offset, 1, "%s, %s, pitch = %d bytes, %stiled, HiZ %d, Seperate Stencil %d\n", + get_965_surfacetype(data[1] >> 29), + get_965_depthformat((data[1] >> 18) & 0x7), + (data[1] & 0x0001ffff) + 1, + data[1] & (1 << 27) ? 
"" : "not ", + (data[1] & (1 << 22)) != 0, + (data[1] & (1 << 21)) != 0); + kgem_debug_print(data, offset, 2, "depth offset\n"); + kgem_debug_print(data, offset, 3, "%dx%d\n", + ((data[3] & 0x0007ffc0) >> 6) + 1, + ((data[3] & 0xfff80000) >> 19) + 1); + kgem_debug_print(data, offset, 4, "volume depth\n"); + kgem_debug_print(data, offset, 5, "\n"); + kgem_debug_print(data, offset, 6, "\n"); + return len; + + case 0x7a00: + assert(len == 4 || len == 5); + switch ((data[1] >> 14) & 0x3) { + case 0: desc1 = "no write"; break; + case 1: desc1 = "qword write"; break; + case 2: desc1 = "PS_DEPTH_COUNT write"; break; + case 3: desc1 = "TIMESTAMP write"; break; + } + kgem_debug_print(data, offset, 0, "PIPE_CONTROL\n"); + kgem_debug_print(data, offset, 1, + "%s, %scs stall, %stlb invalidate, " + "%ssync gfdt, %sdepth stall, %sRC write flush, " + "%sinst flush, %sTC flush\n", + desc1, + data[1] & (1 << 20) ? "" : "no ", + data[1] & (1 << 18) ? "" : "no ", + data[1] & (1 << 17) ? "" : "no ", + data[1] & (1 << 13) ? "" : "no ", + data[1] & (1 << 12) ? "" : "no ", + data[1] & (1 << 11) ? "" : "no ", + data[1] & (1 << 10) ? "" : "no "); + if (len == 5) { + kgem_debug_print(data, offset, 2, "destination address\n"); + kgem_debug_print(data, offset, 3, "immediate dword low\n"); + kgem_debug_print(data, offset, 4, "immediate dword high\n"); + } else { + for (i = 2; i < len; i++) { + kgem_debug_print(data, offset, i, "\n"); + } + } + return len; + + case 0x7b00: + assert(len == 6); + kgem_debug_print(data, offset, 0, + "3DPRIMITIVE: %s %s\n", + get_965_prim_type(data[0]), + (data[0] & (1 << 15)) ? 
"random" : "sequential"); + kgem_debug_print(data, offset, 1, "vertex count\n"); + kgem_debug_print(data, offset, 2, "start vertex\n"); + kgem_debug_print(data, offset, 3, "instance count\n"); + kgem_debug_print(data, offset, 4, "start instance\n"); + kgem_debug_print(data, offset, 5, "index bias\n"); + primitive_out(kgem, data); + return len; + } + + /* For the rest, just dump the bytes */ + for (i = 0; i < ARRAY_SIZE(opcodes); i++) + if (op == opcodes[i].opcode) + break; + + assert(i < ARRAY_SIZE(opcodes)); + + len = 1; + kgem_debug_print(data, offset, 0, "%s\n", opcodes[i].name); + if (opcodes[i].max_len > 1) { + len = (data[0] & 0xff) + 2; + assert(len >= opcodes[i].min_len && + len <= opcodes[i].max_len); + } + + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + + return len; +} + +void kgem_gen5_finish_state(struct kgem *kgem) +{ + memset(&state, 0, sizeof(state)); +} diff --git a/cogl/driver/drm/kgem_debug_gen6.c b/cogl/driver/drm/kgem_debug_gen6.c new file mode 100644 index 00000000..e0b09d55 --- /dev/null +++ b/cogl/driver/drm/kgem_debug_gen6.c @@ -0,0 +1,1075 @@ +/* + * Copyright © 2007-2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Eric Anholt + * Chris Wilson + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include "sna.h" +#include "sna_reg.h" +#include "gen6_render.h" + +#include "kgem_debug.h" + +static struct state { + struct vertex_buffer { + int handle; + const char *ptr; + int pitch; + + struct kgem_bo *current; + } vb[33]; + struct vertex_elements { + int buffer; + int offset; + bool valid; + uint32_t type; + uint8_t swizzle[4]; + } ve[33]; + int num_ve; + + struct dynamic_state { + struct kgem_bo *current; + void *base, *ptr; + } dynamic_state; +} state; + +static void gen6_update_vertex_buffer(struct kgem *kgem, const uint32_t *data) +{ + uint32_t reloc = sizeof(uint32_t) * (&data[1] - kgem->batch); + struct kgem_bo *bo = NULL; + void *base; + int i; + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == reloc) + break; + assert(i < kgem->nreloc); + reloc = kgem->reloc[i].target_handle; + + if (reloc == 0) { + base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == reloc) + break; + assert(&bo->request != &kgem->next_request->buffers); + base = kgem_bo_map__debug(kgem, bo); + } + + base = (char *)base + kgem->reloc[i].delta; + i = data[0] >> 26; + + state.vb[i].current = bo; + state.vb[i].ptr = base; + state.vb[i].pitch = data[0] & 0x7ff; +} + +static void gen6_update_dynamic_buffer(struct kgem *kgem, const uint32_t offset) +{ + uint32_t reloc = sizeof(uint32_t) * 
offset; + struct kgem_bo *bo = NULL; + void *base, *ptr; + int i; + + if ((kgem->batch[offset] & 1) == 0) + return; + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == reloc) + break; + if(i < kgem->nreloc) { + reloc = kgem->reloc[i].target_handle; + + if (reloc == 0) { + base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == reloc) + break; + assert(&bo->request != &kgem->next_request->buffers); + base = kgem_bo_map__debug(kgem, bo); + } + ptr = (char *)base + (kgem->reloc[i].delta & ~1); + } else { + bo = NULL; + base = NULL; + ptr = NULL; + } + + state.dynamic_state.current = bo; + state.dynamic_state.base = base; + state.dynamic_state.ptr = ptr; +} + +static uint32_t +get_ve_component(uint32_t data, int component) +{ + return (data >> (16 + (3 - component) * 4)) & 0x7; +} + +static void gen6_update_vertex_elements(struct kgem *kgem, int id, const uint32_t *data) +{ + state.ve[id].buffer = data[0] >> 26; + state.ve[id].valid = !!(data[0] & (1 << 25)); + state.ve[id].type = (data[0] >> 16) & 0x1ff; + state.ve[id].offset = data[0] & 0x7ff; + state.ve[id].swizzle[0] = get_ve_component(data[1], 0); + state.ve[id].swizzle[1] = get_ve_component(data[1], 1); + state.ve[id].swizzle[2] = get_ve_component(data[1], 2); + state.ve[id].swizzle[3] = get_ve_component(data[1], 3); +} + +static void gen6_update_sf_state(struct kgem *kgem, uint32_t *data) +{ + state.num_ve = 1 + ((data[1] >> 22) & 0x3f); +} + +static void vertices_sint16_out(const struct vertex_elements *ve, const int16_t *v, int max) +{ + int c; + + ErrorF("("); + for (c = 0; c < max; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("%d", v[c]); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + for (; c < 4; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; 
+ case 1: ErrorF("1.0"); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + ErrorF(")"); +} + +static void vertices_float_out(const struct vertex_elements *ve, const float *f, int max) +{ + int c, o; + + ErrorF("("); + for (c = o = 0; c < 4 && o < max; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("%f", f[o++]); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + for (; c < 4; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("1.0"); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + ErrorF(")"); +} + +static void ve_out(const struct vertex_elements *ve, const void *ptr) +{ + switch (ve->type) { + case GEN6_SURFACEFORMAT_R32_FLOAT: + vertices_float_out(ve, ptr, 1); + break; + case GEN6_SURFACEFORMAT_R32G32_FLOAT: + vertices_float_out(ve, ptr, 2); + break; + case GEN6_SURFACEFORMAT_R32G32B32_FLOAT: + vertices_float_out(ve, ptr, 3); + break; + case GEN6_SURFACEFORMAT_R32G32B32A32_FLOAT: + vertices_float_out(ve, ptr, 4); + break; + case GEN6_SURFACEFORMAT_R16_SINT: + vertices_sint16_out(ve, ptr, 1); + break; + case GEN6_SURFACEFORMAT_R16G16_SINT: + vertices_sint16_out(ve, ptr, 2); + break; + case GEN6_SURFACEFORMAT_R16G16B16A16_SINT: + vertices_sint16_out(ve, ptr, 4); + break; + case GEN6_SURFACEFORMAT_R16_SSCALED: + vertices_sint16_out(ve, ptr, 1); + break; + case GEN6_SURFACEFORMAT_R16G16_SSCALED: + vertices_sint16_out(ve, ptr, 2); + break; + case GEN6_SURFACEFORMAT_R16G16B16A16_SSCALED: + vertices_sint16_out(ve, ptr, 4); + break; + } +} + +static void indirect_vertex_out(struct kgem *kgem, uint32_t v) +{ + int 
i = 1; + + do { + const struct vertex_elements *ve = &state.ve[i]; + const struct vertex_buffer *vb = &state.vb[ve->buffer]; + const void *ptr = vb->ptr + v * vb->pitch + ve->offset; + + if (ve->valid) + ve_out(ve, ptr); + + while (++i <= state.num_ve && !state.ve[i].valid) + ; + + if (i <= state.num_ve) + ErrorF(", "); + } while (i <= state.num_ve); +} + +static void primitive_out(struct kgem *kgem, uint32_t *data) +{ + int n; + + assert((data[0] & (1<<15)) == 0); /* XXX index buffers */ + + for (n = 0; n < data[1]; n++) { + int v = data[2] + n; + ErrorF(" [%d:%d] = ", n, v); + indirect_vertex_out(kgem, v); + ErrorF("\n"); + } +} + +static void finish_state(struct kgem *kgem) +{ + memset(&state, 0, sizeof(state)); +} + +static void +state_base_out(uint32_t *data, uint32_t offset, unsigned int index, + const char *name) +{ + if (data[index] & 1) + kgem_debug_print(data, offset, index, + "%s state base address 0x%08x\n", + name, data[index] & ~1); + else + kgem_debug_print(data, offset, index, + "%s state base not updated\n", + name); +} + +static void +state_max_out(uint32_t *data, uint32_t offset, unsigned int index, + const char *name) +{ + if (data[index] == 1) + kgem_debug_print(data, offset, index, + "%s state upper bound disabled\n", name); + else if (data[index] & 1) + kgem_debug_print(data, offset, index, + "%s state upper bound 0x%08x\n", + name, data[index] & ~1); + else + kgem_debug_print(data, offset, index, + "%s state upper bound not updated\n", + name); +} + +static const char * +get_965_surfacetype(unsigned int surfacetype) +{ + switch (surfacetype) { + case 0: return "1D"; + case 1: return "2D"; + case 2: return "3D"; + case 3: return "CUBE"; + case 4: return "BUFFER"; + case 7: return "NULL"; + default: return "unknown"; + } +} + +static const char * +get_965_depthformat(unsigned int depthformat) +{ + switch (depthformat) { + case 0: return "s8_z24float"; + case 1: return "z32float"; + case 2: return "z24s8"; + case 5: return "z16"; + default: 
return "unknown"; + } +} + +static const char * +get_965_element_component(uint32_t data, int component) +{ + uint32_t component_control = (data >> (16 + (3 - component) * 4)) & 0x7; + + switch (component_control) { + case 0: + return "nostore"; + case 1: + switch (component) { + case 0: return "X"; + case 1: return "Y"; + case 2: return "Z"; + case 3: return "W"; + default: return "fail"; + } + case 2: + return "0.0"; + case 3: + return "1.0"; + case 4: + return "0x1"; + case 5: + return "VID"; + default: + return "fail"; + } +} + +static const char * +get_965_prim_type(uint32_t data) +{ + uint32_t primtype = (data >> 10) & 0x1f; + + switch (primtype) { + case 0x01: return "point list"; + case 0x02: return "line list"; + case 0x03: return "line strip"; + case 0x04: return "tri list"; + case 0x05: return "tri strip"; + case 0x06: return "tri fan"; + case 0x07: return "quad list"; + case 0x08: return "quad strip"; + case 0x09: return "line list adj"; + case 0x0a: return "line strip adj"; + case 0x0b: return "tri list adj"; + case 0x0c: return "tri strip adj"; + case 0x0d: return "tri strip reverse"; + case 0x0e: return "polygon"; + case 0x0f: return "rect list"; + case 0x10: return "line loop"; + case 0x11: return "point list bf"; + case 0x12: return "line strip cont"; + case 0x13: return "line strip bf"; + case 0x14: return "line strip cont bf"; + case 0x15: return "tri fan no stipple"; + default: return "fail"; + } +} + +struct reloc { + struct kgem_bo *bo; + void *base; +}; + +static void * +get_reloc(struct kgem *kgem, + void *base, const uint32_t *reloc, + struct reloc *r) +{ + uint32_t delta = *reloc; + + memset(r, 0, sizeof(*r)); + + if (base == 0) { + uint32_t handle = sizeof(uint32_t) * (reloc - kgem->batch); + struct kgem_bo *bo = NULL; + int i; + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == handle) + break; + assert(i < kgem->nreloc); + handle = kgem->reloc[i].target_handle; + delta = kgem->reloc[i].delta; + + if (handle == 0) { + 
base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == handle) + break; + assert(&bo->request != &kgem->next_request->buffers); + base = kgem_bo_map__debug(kgem, bo); + r->bo = bo; + r->base = base; + } + } + + return (char *)base + (delta & ~3); +} + +static const char * +gen6_filter_to_string(uint32_t filter) +{ + switch (filter) { + default: + case GEN6_MAPFILTER_NEAREST: return "nearest"; + case GEN6_MAPFILTER_LINEAR: return "linear"; + } +} + +static const char * +gen6_repeat_to_string(uint32_t repeat) +{ + switch (repeat) { + default: + case GEN6_TEXCOORDMODE_CLAMP_BORDER: return "border"; + case GEN6_TEXCOORDMODE_WRAP: return "wrap"; + case GEN6_TEXCOORDMODE_CLAMP: return "clamp"; + case GEN6_TEXCOORDMODE_MIRROR: return "mirror"; + } +} + +static void +gen6_decode_sampler_state(struct kgem *kgem, const uint32_t *reloc) +{ + const struct gen6_sampler_state *ss; + struct reloc r; + const char *min, *mag; + const char *s_wrap, *t_wrap, *r_wrap; + + ss = get_reloc(kgem, state.dynamic_state.ptr, reloc, &r); + + min = gen6_filter_to_string(ss->ss0.min_filter); + mag = gen6_filter_to_string(ss->ss0.mag_filter); + + s_wrap = gen6_repeat_to_string(ss->ss1.s_wrap_mode); + t_wrap = gen6_repeat_to_string(ss->ss1.t_wrap_mode); + r_wrap = gen6_repeat_to_string(ss->ss1.r_wrap_mode); + + ErrorF(" Sampler 0:\n"); + ErrorF(" filter: min=%s, mag=%s\n", min, mag); + ErrorF(" wrap: s=%s, t=%s, r=%s\n", s_wrap, t_wrap, r_wrap); + + ss++; + min = gen6_filter_to_string(ss->ss0.min_filter); + mag = gen6_filter_to_string(ss->ss0.mag_filter); + + s_wrap = gen6_repeat_to_string(ss->ss1.s_wrap_mode); + t_wrap = gen6_repeat_to_string(ss->ss1.t_wrap_mode); + r_wrap = gen6_repeat_to_string(ss->ss1.r_wrap_mode); + + ErrorF(" Sampler 1:\n"); + ErrorF(" filter: min=%s, mag=%s\n", min, mag); + ErrorF(" wrap: s=%s, t=%s, r=%s\n", s_wrap, t_wrap, r_wrap); +} + +static const char * +gen6_blend_factor_to_string(uint32_t v) +{ + switch 
(v) { +#define C(x) case GEN6_BLENDFACTOR_##x: return #x; + C(ONE); + C(SRC_COLOR); + C(SRC_ALPHA); + C(DST_ALPHA); + C(DST_COLOR); + C(SRC_ALPHA_SATURATE); + C(CONST_COLOR); + C(CONST_ALPHA); + C(SRC1_COLOR); + C(SRC1_ALPHA); + C(ZERO); + C(INV_SRC_COLOR); + C(INV_SRC_ALPHA); + C(INV_DST_ALPHA); + C(INV_DST_COLOR); + C(INV_CONST_COLOR); + C(INV_CONST_ALPHA); + C(INV_SRC1_COLOR); + C(INV_SRC1_ALPHA); +#undef C + default: return "???"; + } +} + +static const char * +gen6_blend_function_to_string(uint32_t v) +{ + switch (v) { +#define C(x) case GEN6_BLENDFUNCTION_##x: return #x; + C(ADD); + C(SUBTRACT); + C(REVERSE_SUBTRACT); + C(MIN); + C(MAX); +#undef C + default: return "???"; + } +} + +static float unpack_float(uint32_t dw) +{ + union { + float f; + uint32_t dw; + } u; + u.dw = dw; + return u.f; +} + +static void +gen6_decode_blend(struct kgem *kgem, const uint32_t *reloc) +{ + const struct gen6_blend_state *blend; + struct reloc r; + const char *dst, *src; + const char *func; + + blend = get_reloc(kgem, state.dynamic_state.ptr, reloc, &r); + + dst = gen6_blend_factor_to_string(blend->blend0.dest_blend_factor); + src = gen6_blend_factor_to_string(blend->blend0.source_blend_factor); + func = gen6_blend_function_to_string(blend->blend0.blend_func); + + ErrorF(" Blend (%s): function %s, src=%s, dst=%s\n", + blend->blend0.blend_enable ? 
"enabled" : "disabled", + func, src, dst); +} + +int kgem_gen6_decode_3d(struct kgem *kgem, uint32_t offset) +{ + static const struct { + uint32_t opcode; + int min_len; + int max_len; + const char *name; + } opcodes[] = { + { 0x6101, 6, 6, "STATE_BASE_ADDRESS" }, + { 0x6102, 2, 2 , "STATE_SIP" }, + { 0x6104, 1, 1, "3DSTATE_PIPELINE_SELECT" }, + { 0x680b, 1, 1, "3DSTATE_VF_STATISTICS" }, + { 0x6904, 1, 1, "3DSTATE_PIPELINE_SELECT" }, + { 0x7800, 7, 7, "3DSTATE_PIPELINED_POINTERS" }, + { 0x7801, 6, 6, "3DSTATE_BINDING_TABLE_POINTERS" }, + { 0x7808, 5, 257, "3DSTATE_VERTEX_BUFFERS" }, + { 0x7809, 3, 256, "3DSTATE_VERTEX_ELEMENTS" }, + { 0x780a, 3, 3, "3DSTATE_INDEX_BUFFER" }, + { 0x780b, 1, 1, "3DSTATE_VF_STATISTICS" }, + { 0x7900, 4, 4, "3DSTATE_DRAWING_RECTANGLE" }, + { 0x7901, 5, 5, "3DSTATE_CONSTANT_COLOR" }, + { 0x7905, 5, 7, "3DSTATE_DEPTH_BUFFER" }, + { 0x7906, 2, 2, "3DSTATE_POLY_STIPPLE_OFFSET" }, + { 0x7907, 33, 33, "3DSTATE_POLY_STIPPLE_PATTERN" }, + { 0x7908, 3, 3, "3DSTATE_LINE_STIPPLE" }, + { 0x7909, 2, 2, "3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" }, + { 0x7909, 2, 2, "3DSTATE_CLEAR_PARAMS" }, + { 0x790a, 3, 3, "3DSTATE_AA_LINE_PARAMETERS" }, + { 0x790b, 4, 4, "3DSTATE_GS_SVB_INDEX" }, + { 0x790d, 3, 3, "3DSTATE_MULTISAMPLE" }, + { 0x7910, 2, 2, "3DSTATE_CLEAR_PARAMS" }, + { 0x7b00, 6, 6, "3DPRIMITIVE" }, + { 0x7802, 4, 4, "3DSTATE_SAMPLER_STATE_POINTERS" }, + { 0x7805, 3, 3, "3DSTATE_URB" }, + { 0x780d, 4, 4, "3DSTATE_VIEWPORT_STATE_POINTERS" }, + { 0x780e, 4, 4, "3DSTATE_CC_STATE_POINTERS" }, + { 0x780f, 2, 2, "3DSTATE_SCISSOR_STATE_POINTERS" }, + { 0x7810, 6, 6, "3DSTATE_VS_STATE" }, + { 0x7811, 7, 7, "3DSTATE_GS_STATE" }, + { 0x7812, 4, 4, "3DSTATE_CLIP_STATE" }, + { 0x7813, 20, 20, "3DSTATE_SF_STATE" }, + { 0x7814, 9, 9, "3DSTATE_WM_STATE" }, + { 0x7815, 5, 5, "3DSTATE_CONSTANT_VS_STATE" }, + { 0x7816, 5, 5, "3DSTATE_CONSTANT_GS_STATE" }, + { 0x7817, 5, 5, "3DSTATE_CONSTANT_WM_STATE" }, + { 0x7818, 2, 2, "3DSTATE_SAMPLE_MASK" }, + }; + uint32_t *data = 
kgem->batch + offset; + uint32_t op; + unsigned int len; + int i, j; + const char *desc1 = NULL; + + len = (data[0] & 0xff) + 2; + op = (data[0] & 0xffff0000) >> 16; + switch (op) { + case 0x6101: + i = 0; + kgem_debug_print(data, offset, i++, "STATE_BASE_ADDRESS\n"); + if (kgem->gen >= 60) { + assert(len == 10); + + state_base_out(data, offset, i++, "general"); + state_base_out(data, offset, i++, "surface"); + state_base_out(data, offset, i++, "dynamic"); + state_base_out(data, offset, i++, "indirect"); + state_base_out(data, offset, i++, "instruction"); + + state_max_out(data, offset, i++, "general"); + state_max_out(data, offset, i++, "dynamic"); + state_max_out(data, offset, i++, "indirect"); + state_max_out(data, offset, i++, "instruction"); + + gen6_update_dynamic_buffer(kgem, offset + 3); + } else if (kgem->gen >= 50) { + assert(len == 8); + + state_base_out(data, offset, i++, "general"); + state_base_out(data, offset, i++, "surface"); + state_base_out(data, offset, i++, "media"); + state_base_out(data, offset, i++, "instruction"); + + state_max_out(data, offset, i++, "general"); + state_max_out(data, offset, i++, "media"); + state_max_out(data, offset, i++, "instruction"); + } + + return len; + + case 0x7801: + if (kgem->gen >= 60) { + assert(len == 4); + + kgem_debug_print(data, offset, 0, + "3DSTATE_BINDING_TABLE_POINTERS: VS mod %d, " + "GS mod %d, WM mod %d\n", + (data[0] & (1 << 8)) != 0, + (data[0] & (1 << 9)) != 0, + (data[0] & (1 << 12)) != 0); + kgem_debug_print(data, offset, 1, "VS binding table\n"); + kgem_debug_print(data, offset, 2, "GS binding table\n"); + kgem_debug_print(data, offset, 3, "WM binding table\n"); + } else if (kgem->gen >= 40) { + assert(len == 6); + + kgem_debug_print(data, offset, 0, + "3DSTATE_BINDING_TABLE_POINTERS\n"); + kgem_debug_print(data, offset, 1, "VS binding table\n"); + kgem_debug_print(data, offset, 2, "GS binding table\n"); + kgem_debug_print(data, offset, 3, "CLIP binding table\n"); + kgem_debug_print(data, 
offset, 4, "SF binding table\n"); + kgem_debug_print(data, offset, 5, "WM binding table\n"); + } + + return len; + + case 0x7802: + assert(len == 4); + kgem_debug_print(data, offset, 0, "3DSTATE_SAMPLER_STATE_POINTERS: VS mod %d, " + "GS mod %d, WM mod %d\n", + (data[0] & (1 << 8)) != 0, + (data[0] & (1 << 9)) != 0, + (data[0] & (1 << 12)) != 0); + kgem_debug_print(data, offset, 1, "VS sampler state\n"); + kgem_debug_print(data, offset, 2, "GS sampler state\n"); + kgem_debug_print(data, offset, 3, "WM sampler state\n"); + gen6_decode_sampler_state(kgem, &data[3]); + return len; + + case 0x7808: + assert((len - 1) % 4 == 0); + kgem_debug_print(data, offset, 0, "3DSTATE_VERTEX_BUFFERS\n"); + + for (i = 1; i < len;) { + gen6_update_vertex_buffer(kgem, data + i); + + kgem_debug_print(data, offset, i, "buffer %d: %s, pitch %db\n", + data[i] >> 26, + data[i] & (1 << 20) ? "random" : "sequential", + data[i] & 0x07ff); + i++; + kgem_debug_print(data, offset, i++, "buffer address\n"); + kgem_debug_print(data, offset, i++, "max index\n"); + kgem_debug_print(data, offset, i++, "mbz\n"); + } + return len; + + case 0x7809: + assert((len + 1) % 2 == 0); + kgem_debug_print(data, offset, 0, "3DSTATE_VERTEX_ELEMENTS\n"); + + for (i = 1; i < len;) { + gen6_update_vertex_elements(kgem, (i - 1)/2, data + i); + + kgem_debug_print(data, offset, i, "buffer %d: %svalid, type 0x%04x, " + "src offset 0x%04x bytes\n", + data[i] >> 26, + data[i] & (1 << 25) ? 
"" : "in", + (data[i] >> 16) & 0x1ff, + data[i] & 0x07ff); + i++; + kgem_debug_print(data, offset, i, "(%s, %s, %s, %s), " + "dst offset 0x%02x bytes\n", + get_965_element_component(data[i], 0), + get_965_element_component(data[i], 1), + get_965_element_component(data[i], 2), + get_965_element_component(data[i], 3), + (data[i] & 0xff) * 4); + i++; + } + return len; + + case 0x780d: + assert(len == 4); + kgem_debug_print(data, offset, 0, "3DSTATE_VIEWPORT_STATE_POINTERS\n"); + kgem_debug_print(data, offset, 1, "clip\n"); + kgem_debug_print(data, offset, 2, "sf\n"); + kgem_debug_print(data, offset, 3, "cc\n"); + return len; + + case 0x780a: + assert(len == 3); + kgem_debug_print(data, offset, 0, "3DSTATE_INDEX_BUFFER\n"); + kgem_debug_print(data, offset, 1, "beginning buffer address\n"); + kgem_debug_print(data, offset, 2, "ending buffer address\n"); + return len; + + case 0x780e: + assert(len == 4); + kgem_debug_print(data, offset, 0, "3DSTATE_CC_STATE_POINTERS\n"); + kgem_debug_print(data, offset, 1, "blend%s\n", + data[1] & 1 ? " update" : ""); + if (data[1] & 1) + gen6_decode_blend(kgem, data+1); + kgem_debug_print(data, offset, 2, "depth+stencil%s\n", + data[2] & 1 ? " update" : ""); + kgem_debug_print(data, offset, 3, "cc%s\n", + data[3] & 1 ? 
" update" : ""); + return len; + + case 0x780f: + assert(len == 2); + kgem_debug_print(data, offset, 0, "3DSTATE_SCISSOR_POINTERS\n"); + kgem_debug_print(data, offset, 1, "scissor rect offset\n"); + return len; + + case 0x7810: + assert(len == 6); + kgem_debug_print(data, offset, 0, "3DSTATE_VS\n"); + kgem_debug_print(data, offset, 1, "kernel pointer\n"); + kgem_debug_print(data, offset, 2, "SPF=%d, VME=%d, Sampler Count %d, " + "Binding table count %d\n", + (data[2] >> 31) & 1, + (data[2] >> 30) & 1, + (data[2] >> 27) & 7, + (data[2] >> 18) & 0xff); + kgem_debug_print(data, offset, 3, "scratch offset\n"); + kgem_debug_print(data, offset, 4, "Dispatch GRF start %d, VUE read length %d, " + "VUE read offset %d\n", + (data[4] >> 20) & 0x1f, + (data[4] >> 11) & 0x3f, + (data[4] >> 4) & 0x3f); + kgem_debug_print(data, offset, 5, "Max Threads %d, Vertex Cache %sable, " + "VS func %sable\n", + ((data[5] >> 25) & 0x7f) + 1, + (data[5] & (1 << 1)) != 0 ? "dis" : "en", + (data[5] & 1) != 0 ? "en" : "dis"); + return len; + + case 0x7811: + assert(len == 7); + kgem_debug_print(data, offset, 0, "3DSTATE_GS\n"); + kgem_debug_print(data, offset, 1, "kernel pointer\n"); + kgem_debug_print(data, offset, 2, "SPF=%d, VME=%d, Sampler Count %d, " + "Binding table count %d\n", + (data[2] >> 31) & 1, + (data[2] >> 30) & 1, + (data[2] >> 27) & 7, + (data[2] >> 18) & 0xff); + kgem_debug_print(data, offset, 3, "scratch offset\n"); + kgem_debug_print(data, offset, 4, "Dispatch GRF start %d, VUE read length %d, " + "VUE read offset %d\n", + (data[4] & 0xf), + (data[4] >> 11) & 0x3f, + (data[4] >> 4) & 0x3f); + kgem_debug_print(data, offset, 5, "Max Threads %d, Rendering %sable\n", + ((data[5] >> 25) & 0x7f) + 1, + (data[5] & (1 << 8)) != 0 ? "en" : "dis"); + kgem_debug_print(data, offset, 6, "Reorder %sable, Discard Adjaceny %sable, " + "GS %sable\n", + (data[6] & (1 << 30)) != 0 ? "en" : "dis", + (data[6] & (1 << 29)) != 0 ? "en" : "dis", + (data[6] & (1 << 15)) != 0 ? 
"en" : "dis"); + return len; + + case 0x7812: + assert(len == 4); + kgem_debug_print(data, offset, 0, "3DSTATE_CLIP\n"); + kgem_debug_print(data, offset, 1, "UserClip distance cull test mask 0x%x\n", + data[1] & 0xff); + kgem_debug_print(data, offset, 2, "Clip %sable, API mode %s, Viewport XY test %sable, " + "Viewport Z test %sable, Guardband test %sable, Clip mode %d, " + "Perspective Divide %sable, Non-Perspective Barycentric %sable, " + "Tri Provoking %d, Line Provoking %d, Trifan Provoking %d\n", + (data[2] & (1 << 31)) != 0 ? "en" : "dis", + (data[2] & (1 << 30)) != 0 ? "D3D" : "OGL", + (data[2] & (1 << 28)) != 0 ? "en" : "dis", + (data[2] & (1 << 27)) != 0 ? "en" : "dis", + (data[2] & (1 << 26)) != 0 ? "en" : "dis", + (data[2] >> 13) & 7, + (data[2] & (1 << 9)) != 0 ? "dis" : "en", + (data[2] & (1 << 8)) != 0 ? "en" : "dis", + (data[2] >> 4) & 3, + (data[2] >> 2) & 3, + (data[2] & 3)); + kgem_debug_print(data, offset, 3, "Min PointWidth %d, Max PointWidth %d, " + "Force Zero RTAIndex %sable, Max VPIndex %d\n", + (data[3] >> 17) & 0x7ff, + (data[3] >> 6) & 0x7ff, + (data[3] & (1 << 5)) != 0 ? "en" : "dis", + (data[3] & 0xf)); + return len; + + case 0x7813: + gen6_update_sf_state(kgem, data); + assert(len == 20); + kgem_debug_print(data, offset, 0, "3DSTATE_SF\n"); + kgem_debug_print(data, offset, 1, "Attrib Out %d, Attrib Swizzle %sable, VUE read length %d, " + "VUE read offset %d\n", + (data[1] >> 22) & 0x3f, + (data[1] & (1 << 21)) != 0 ? "en" : "dis", + (data[1] >> 11) & 0x1f, + (data[1] >> 4) & 0x3f); + kgem_debug_print(data, offset, 2, "Legacy Global DepthBias %sable, FrontFace fill %d, BF fill %d, " + "VP transform %sable, FrontWinding_%s\n", + (data[2] & (1 << 11)) != 0 ? "en" : "dis", + (data[2] >> 5) & 3, + (data[2] >> 3) & 3, + (data[2] & (1 << 1)) != 0 ? "en" : "dis", + (data[2] & 1) != 0 ? "CCW" : "CW"); + kgem_debug_print(data, offset, 3, "AA %sable, CullMode %d, Scissor %sable, Multisample m ode %d\n", + (data[3] & (1 << 31)) != 0 ? 
"en" : "dis", + (data[3] >> 29) & 3, + (data[3] & (1 << 11)) != 0 ? "en" : "dis", + (data[3] >> 8) & 3); + kgem_debug_print(data, offset, 4, "Last Pixel %sable, SubPixel Precision %d, Use PixelWidth %d\n", + (data[4] & (1 << 31)) != 0 ? "en" : "dis", + (data[4] & (1 << 12)) != 0 ? 4 : 8, + (data[4] & (1 << 11)) != 0); + kgem_debug_print(data, offset, 5, "Global Depth Offset Constant %f\n", unpack_float(data[5])); + kgem_debug_print(data, offset, 6, "Global Depth Offset Scale %f\n", unpack_float(data[6])); + kgem_debug_print(data, offset, 7, "Global Depth Offset Clamp %f\n", unpack_float(data[7])); + for (i = 0, j = 0; i < 8; i++, j+=2) + kgem_debug_print(data, offset, i+8, "Attrib %d (Override %s%s%s%s, Const Source %d, Swizzle Select %d, " + "Source %d); Attrib %d (Override %s%s%s%s, Const Source %d, Swizzle Select %d, Source %d)\n", + j+1, + (data[8+i] & (1 << 31)) != 0 ? "W":"", + (data[8+i] & (1 << 30)) != 0 ? "Z":"", + (data[8+i] & (1 << 29)) != 0 ? "Y":"", + (data[8+i] & (1 << 28)) != 0 ? "X":"", + (data[8+i] >> 25) & 3, (data[8+i] >> 22) & 3, + (data[8+i] >> 16) & 0x1f, + j, + (data[8+i] & (1 << 15)) != 0 ? "W":"", + (data[8+i] & (1 << 14)) != 0 ? "Z":"", + (data[8+i] & (1 << 13)) != 0 ? "Y":"", + (data[8+i] & (1 << 12)) != 0 ? 
"X":"", + (data[8+i] >> 9) & 3, (data[8+i] >> 6) & 3, + (data[8+i] & 0x1f)); + kgem_debug_print(data, offset, 16, "Point Sprite TexCoord Enable\n"); + kgem_debug_print(data, offset, 17, "Const Interp Enable\n"); + kgem_debug_print(data, offset, 18, "Attrib 7-0 WrapShortest Enable\n"); + kgem_debug_print(data, offset, 19, "Attrib 15-8 WrapShortest Enable\n"); + + return len; + + case 0x7814: + assert(len == 9); + kgem_debug_print(data, offset, 0, "3DSTATE_WM\n"); + kgem_debug_print(data, offset, 1, "kernel start pointer 0\n"); + kgem_debug_print(data, offset, 2, "SPF=%d, VME=%d, Sampler Count %d, " + "Binding table count %d\n", + (data[2] >> 31) & 1, + (data[2] >> 30) & 1, + (data[2] >> 27) & 7, + (data[2] >> 18) & 0xff); + kgem_debug_print(data, offset, 3, "scratch offset\n"); + kgem_debug_print(data, offset, 4, "Depth Clear %d, Depth Resolve %d, HiZ Resolve %d, " + "Dispatch GRF start[0] %d, start[1] %d, start[2] %d\n", + (data[4] & (1 << 30)) != 0, + (data[4] & (1 << 28)) != 0, + (data[4] & (1 << 27)) != 0, + (data[4] >> 16) & 0x7f, + (data[4] >> 8) & 0x7f, + (data[4] & 0x7f)); + kgem_debug_print(data, offset, 5, "MaxThreads %d, PS KillPixel %d, PS computed Z %d, " + "PS use sourceZ %d, Thread Dispatch %d, PS use sourceW %d, Dispatch32 %d, " + "Dispatch16 %d, Dispatch8 %d\n", + ((data[5] >> 25) & 0x7f) + 1, + (data[5] & (1 << 22)) != 0, + (data[5] & (1 << 21)) != 0, + (data[5] & (1 << 20)) != 0, + (data[5] & (1 << 19)) != 0, + (data[5] & (1 << 8)) != 0, + (data[5] & (1 << 2)) != 0, + (data[5] & (1 << 1)) != 0, + (data[5] & (1 << 0)) != 0); + kgem_debug_print(data, offset, 6, "Num SF output %d, Pos XY offset %d, ZW interp mode %d , " + "Barycentric interp mode 0x%x, Point raster rule %d, Multisample mode %d, " + "Multisample Dispatch mode %d\n", + (data[6] >> 20) & 0x3f, + (data[6] >> 18) & 3, + (data[6] >> 16) & 3, + (data[6] >> 10) & 0x3f, + (data[6] & (1 << 9)) != 0, + (data[6] >> 1) & 3, + (data[6] & 1)); + kgem_debug_print(data, offset, 7, "kernel start 
pointer 1\n"); + kgem_debug_print(data, offset, 8, "kernel start pointer 2\n"); + + return len; + + case 0x7900: + assert(len == 4); + kgem_debug_print(data, offset, 0, + "3DSTATE_DRAWING_RECTANGLE\n"); + kgem_debug_print(data, offset, 1, "top left: %d, %d\n", + (uint16_t)(data[1] & 0xffff), + (uint16_t)(data[1] >> 16)); + kgem_debug_print(data, offset, 2, "bottom right: %d, %d\n", + (uint16_t)(data[2] & 0xffff), + (uint16_t)(data[2] >> 16)); + kgem_debug_print(data, offset, 3, "origin: %d, %d\n", + (int16_t)(data[3] & 0xffff), + (int16_t)(data[3] >> 16)); + return len; + + case 0x7905: + assert(len == 7); + kgem_debug_print(data, offset, 0, + "3DSTATE_DEPTH_BUFFER\n"); + kgem_debug_print(data, offset, 1, "%s, %s, pitch = %d bytes, %stiled, HiZ %d, Seperate Stencil %d\n", + get_965_surfacetype(data[1] >> 29), + get_965_depthformat((data[1] >> 18) & 0x7), + (data[1] & 0x0001ffff) + 1, + data[1] & (1 << 27) ? "" : "not ", + (data[1] & (1 << 22)) != 0, + (data[1] & (1 << 21)) != 0); + kgem_debug_print(data, offset, 2, "depth offset\n"); + kgem_debug_print(data, offset, 3, "%dx%d\n", + ((data[3] & 0x0007ffc0) >> 6) + 1, + ((data[3] & 0xfff80000) >> 19) + 1); + kgem_debug_print(data, offset, 4, "volume depth\n"); + kgem_debug_print(data, offset, 5, "\n"); + kgem_debug_print(data, offset, 6, "\n"); + return len; + + case 0x7a00: + assert(len == 4 || len == 5); + switch ((data[1] >> 14) & 0x3) { + case 0: desc1 = "no write"; break; + case 1: desc1 = "qword write"; break; + case 2: desc1 = "PS_DEPTH_COUNT write"; break; + case 3: desc1 = "TIMESTAMP write"; break; + } + kgem_debug_print(data, offset, 0, "PIPE_CONTROL\n"); + kgem_debug_print(data, offset, 1, + "%s, %scs stall, %stlb invalidate, " + "%ssync gfdt, %sdepth stall, %sRC write flush, " + "%sinst flush, %sTC flush\n", + desc1, + data[1] & (1 << 20) ? "" : "no ", + data[1] & (1 << 18) ? "" : "no ", + data[1] & (1 << 17) ? "" : "no ", + data[1] & (1 << 13) ? "" : "no ", + data[1] & (1 << 12) ? 
"" : "no ", + data[1] & (1 << 11) ? "" : "no ", + data[1] & (1 << 10) ? "" : "no "); + if (len == 5) { + kgem_debug_print(data, offset, 2, "destination address\n"); + kgem_debug_print(data, offset, 3, "immediate dword low\n"); + kgem_debug_print(data, offset, 4, "immediate dword high\n"); + } else { + for (i = 2; i < len; i++) { + kgem_debug_print(data, offset, i, "\n"); + } + } + return len; + + case 0x7b00: + assert(len == 6); + kgem_debug_print(data, offset, 0, + "3DPRIMITIVE: %s %s\n", + get_965_prim_type(data[0]), + (data[0] & (1 << 15)) ? "random" : "sequential"); + kgem_debug_print(data, offset, 1, "vertex count\n"); + kgem_debug_print(data, offset, 2, "start vertex\n"); + kgem_debug_print(data, offset, 3, "instance count\n"); + kgem_debug_print(data, offset, 4, "start instance\n"); + kgem_debug_print(data, offset, 5, "index bias\n"); + primitive_out(kgem, data); + return len; + } + + /* For the rest, just dump the bytes */ + for (i = 0; i < ARRAY_SIZE(opcodes); i++) + if (op == opcodes[i].opcode) + break; + + assert(i < ARRAY_SIZE(opcodes)); + + len = 1; + kgem_debug_print(data, offset, 0, "%s\n", opcodes[i].name); + if (opcodes[i].max_len > 1) { + len = (data[0] & 0xff) + 2; + assert(len >= opcodes[i].min_len && + len <= opcodes[i].max_len); + } + + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + + return len; +} + +void kgem_gen6_finish_state(struct kgem *kgem) +{ + finish_state(kgem); +} diff --git a/cogl/driver/drm/kgem_debug_gen7.c b/cogl/driver/drm/kgem_debug_gen7.c new file mode 100644 index 00000000..1bc014bf --- /dev/null +++ b/cogl/driver/drm/kgem_debug_gen7.c @@ -0,0 +1,716 @@ +/* + * Copyright © 2007-2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, 
distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Eric Anholt + * Chris Wilson + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include "sna.h" +#include "sna_reg.h" +#include "gen7_render.h" + +#include "kgem_debug.h" + +static struct state { + struct vertex_buffer { + int handle; + void *base; + const char *ptr; + int pitch; + + struct kgem_bo *current; + } vb[33]; + struct vertex_elements { + int buffer; + int offset; + bool valid; + uint32_t type; + uint8_t swizzle[4]; + } ve[33]; + int num_ve; + + struct dynamic_state { + struct kgem_bo *current; + void *base, *ptr; + } dynamic_state; +} state; + +static void gen7_update_vertex_buffer(struct kgem *kgem, const uint32_t *data) +{ + uint32_t reloc = sizeof(uint32_t) * (&data[1] - kgem->batch); + struct kgem_bo *bo = NULL; + void *base, *ptr; + int i; + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == reloc) + break; + assert(i < kgem->nreloc); + reloc = kgem->reloc[i].target_handle; + + if (reloc == 0) { + base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == reloc) + break; + assert(&bo->request != 
&kgem->next_request->buffers); + base = kgem_bo_map__debug(kgem, bo); + } + ptr = (char *)base + kgem->reloc[i].delta; + + i = data[0] >> 26; + + state.vb[i].current = bo; + state.vb[i].base = base; + state.vb[i].ptr = ptr; + state.vb[i].pitch = data[0] & 0x7ff; +} + +static void gen7_update_dynamic_buffer(struct kgem *kgem, const uint32_t offset) +{ + uint32_t reloc = sizeof(uint32_t) * offset; + struct kgem_bo *bo = NULL; + void *base, *ptr; + int i; + + if ((kgem->batch[offset] & 1) == 0) + return; + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == reloc) + break; + if(i < kgem->nreloc) { + reloc = kgem->reloc[i].target_handle; + + if (reloc == 0) { + base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == reloc) + break; + assert(&bo->request != &kgem->next_request->buffers); + base = kgem_bo_map__debug(kgem, bo); + } + ptr = (char *)base + (kgem->reloc[i].delta & ~1); + } else { + bo = NULL; + base = NULL; + ptr = NULL; + } + + state.dynamic_state.current = bo; + state.dynamic_state.base = base; + state.dynamic_state.ptr = ptr; +} + +static uint32_t +get_ve_component(uint32_t data, int component) +{ + return (data >> (16 + (3 - component) * 4)) & 0x7; +} + +static void gen7_update_vertex_elements(struct kgem *kgem, int id, const uint32_t *data) +{ + state.ve[id].buffer = data[0] >> 26; + state.ve[id].valid = !!(data[0] & (1 << 25)); + state.ve[id].type = (data[0] >> 16) & 0x1ff; + state.ve[id].offset = data[0] & 0x7ff; + state.ve[id].swizzle[0] = get_ve_component(data[1], 0); + state.ve[id].swizzle[1] = get_ve_component(data[1], 1); + state.ve[id].swizzle[2] = get_ve_component(data[1], 2); + state.ve[id].swizzle[3] = get_ve_component(data[1], 3); +} + +static void gen7_update_sf_state(struct kgem *kgem, uint32_t *data) +{ + state.num_ve = 1 + ((data[1] >> 22) & 0x3f); +} + +static void vertices_sint16_out(const struct vertex_elements *ve, const int16_t *v, int max) +{ + int c; + + 
ErrorF("("); + for (c = 0; c < max; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("%d", v[c]); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + for (; c < 4; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("1.0"); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + ErrorF(")"); +} + +static void vertices_float_out(const struct vertex_elements *ve, const float *f, int max) +{ + int c, o; + + ErrorF("("); + for (c = o = 0; c < 4 && o < max; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("%f", f[o++]); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + for (; c < 4; c++) { + switch (ve->swizzle[c]) { + case 0: ErrorF("#"); break; + case 1: ErrorF("1.0"); break; + case 2: ErrorF("0.0"); break; + case 3: ErrorF("1.0"); break; + case 4: ErrorF("0x1"); break; + case 5: break; + default: ErrorF("?"); + } + if (c < 3) + ErrorF(", "); + } + ErrorF(")"); +} + +static void ve_out(const struct vertex_elements *ve, const void *ptr) +{ + switch (ve->type) { + case GEN7_SURFACEFORMAT_R32_FLOAT: + vertices_float_out(ve, ptr, 1); + break; + case GEN7_SURFACEFORMAT_R32G32_FLOAT: + vertices_float_out(ve, ptr, 2); + break; + case GEN7_SURFACEFORMAT_R32G32B32_FLOAT: + vertices_float_out(ve, ptr, 3); + break; + case GEN7_SURFACEFORMAT_R32G32B32A32_FLOAT: + vertices_float_out(ve, ptr, 4); + break; + case GEN7_SURFACEFORMAT_R16_SINT: + vertices_sint16_out(ve, ptr, 1); + break; + case GEN7_SURFACEFORMAT_R16G16_SINT: + vertices_sint16_out(ve, ptr, 2); + break; + case 
GEN7_SURFACEFORMAT_R16G16B16A16_SINT: + vertices_sint16_out(ve, ptr, 4); + break; + case GEN7_SURFACEFORMAT_R16_SSCALED: + vertices_sint16_out(ve, ptr, 1); + break; + case GEN7_SURFACEFORMAT_R16G16_SSCALED: + vertices_sint16_out(ve, ptr, 2); + break; + case GEN7_SURFACEFORMAT_R16G16B16A16_SSCALED: + vertices_sint16_out(ve, ptr, 4); + break; + } +} + +static void indirect_vertex_out(struct kgem *kgem, uint32_t v) +{ + int i = 1; + + do { + const struct vertex_elements *ve = &state.ve[i]; + const struct vertex_buffer *vb = &state.vb[ve->buffer]; + const void *ptr = vb->ptr + v * vb->pitch + ve->offset; + + if (!ve->valid) + continue; + + ve_out(ve, ptr); + + while (++i <= state.num_ve && !state.ve[i].valid) + ; + + if (i <= state.num_ve) + ErrorF(", "); + } while (i <= state.num_ve); +} + +static void primitive_out(struct kgem *kgem, uint32_t *data) +{ + int n; + + assert((data[0] & (1<<15)) == 0); /* XXX index buffers */ + + for (n = 0; n < data[2]; n++) { + int v = data[3] + n; + ErrorF(" [%d:%d] = ", n, v); + indirect_vertex_out(kgem, v); + ErrorF("\n"); + } +} + +static void finish_state(struct kgem *kgem) +{ + memset(&state, 0, sizeof(state)); +} + +static void +state_base_out(uint32_t *data, uint32_t offset, unsigned int index, + const char *name) +{ + if (data[index] & 1) + kgem_debug_print(data, offset, index, + "%s state base address 0x%08x\n", + name, data[index] & ~1); + else + kgem_debug_print(data, offset, index, + "%s state base not updated\n", + name); +} + +static void +state_max_out(uint32_t *data, uint32_t offset, unsigned int index, + const char *name) +{ + if (data[index] == 1) + kgem_debug_print(data, offset, index, + "%s state upper bound disabled\n", name); + else if (data[index] & 1) + kgem_debug_print(data, offset, index, + "%s state upper bound 0x%08x\n", + name, data[index] & ~1); + else + kgem_debug_print(data, offset, index, + "%s state upper bound not updated\n", + name); +} + +static const char * +get_965_surfacetype(unsigned int 
surfacetype) +{ + switch (surfacetype) { + case 0: return "1D"; + case 1: return "2D"; + case 2: return "3D"; + case 3: return "CUBE"; + case 4: return "BUFFER"; + case 7: return "NULL"; + default: return "unknown"; + } +} + +static const char * +get_965_depthformat(unsigned int depthformat) +{ + switch (depthformat) { + case 0: return "s8_z24float"; + case 1: return "z32float"; + case 2: return "z24s8"; + case 5: return "z16"; + default: return "unknown"; + } +} + +static const char * +get_element_component(uint32_t data, int component) +{ + uint32_t component_control = (data >> (16 + (3 - component) * 4)) & 0x7; + + switch (component_control) { + case 0: + return "nostore"; + case 1: + switch (component) { + case 0: return "X"; + case 1: return "Y"; + case 2: return "Z"; + case 3: return "W"; + default: return "fail"; + } + case 2: + return "0.0"; + case 3: + return "1.0"; + case 4: + return "0x1"; + case 5: + return "VID"; + default: + return "fail"; + } +} + +static const char * +get_prim_type(uint32_t data) +{ + uint32_t primtype = data & 0x1f; + + switch (primtype) { + case 0x01: return "point list"; + case 0x02: return "line list"; + case 0x03: return "line strip"; + case 0x04: return "tri list"; + case 0x05: return "tri strip"; + case 0x06: return "tri fan"; + case 0x07: return "quad list"; + case 0x08: return "quad strip"; + case 0x09: return "line list adj"; + case 0x0a: return "line strip adj"; + case 0x0b: return "tri list adj"; + case 0x0c: return "tri strip adj"; + case 0x0d: return "tri strip reverse"; + case 0x0e: return "polygon"; + case 0x0f: return "rect list"; + case 0x10: return "line loop"; + case 0x11: return "point list bf"; + case 0x12: return "line strip cont"; + case 0x13: return "line strip bf"; + case 0x14: return "line strip cont bf"; + case 0x15: return "tri fan no stipple"; + default: return "fail"; + } +} + +struct reloc { + struct kgem_bo *bo; + void *base; +}; + +static void * +get_reloc(struct kgem *kgem, + void *base, const 
uint32_t *reloc, + struct reloc *r) +{ + uint32_t delta = *reloc; + + memset(r, 0, sizeof(*r)); + + if (base == 0) { + uint32_t handle = sizeof(uint32_t) * (reloc - kgem->batch); + struct kgem_bo *bo = NULL; + int i; + + for (i = 0; i < kgem->nreloc; i++) + if (kgem->reloc[i].offset == handle) + break; + assert(i < kgem->nreloc); + handle = kgem->reloc[i].target_handle; + delta = kgem->reloc[i].delta; + + if (handle == 0) { + base = kgem->batch; + } else { + list_for_each_entry(bo, &kgem->next_request->buffers, request) + if (bo->handle == handle) + break; + assert(&bo->request != &kgem->next_request->buffers); + base = kgem_bo_map__debug(kgem, bo); + r->bo = bo; + r->base = base; + } + } + + return (char *)base + (delta & ~3); +} + +static const char * +gen7_filter_to_string(uint32_t filter) +{ + switch (filter) { + default: + case GEN7_MAPFILTER_NEAREST: return "nearest"; + case GEN7_MAPFILTER_LINEAR: return "linear"; + } +} + +static const char * +gen7_repeat_to_string(uint32_t repeat) +{ + switch (repeat) { + default: + case GEN7_TEXCOORDMODE_CLAMP_BORDER: return "border"; + case GEN7_TEXCOORDMODE_WRAP: return "wrap"; + case GEN7_TEXCOORDMODE_CLAMP: return "clamp"; + case GEN7_TEXCOORDMODE_MIRROR: return "mirror"; + } +} + +static void +gen7_decode_sampler_state(struct kgem *kgem, const uint32_t *reloc) +{ + const struct gen7_sampler_state *ss; + struct reloc r; + const char *min, *mag; + const char *s_wrap, *t_wrap, *r_wrap; + + ss = get_reloc(kgem, state.dynamic_state.ptr, reloc, &r); + + min = gen7_filter_to_string(ss->ss0.min_filter); + mag = gen7_filter_to_string(ss->ss0.mag_filter); + + s_wrap = gen7_repeat_to_string(ss->ss3.s_wrap_mode); + t_wrap = gen7_repeat_to_string(ss->ss3.t_wrap_mode); + r_wrap = gen7_repeat_to_string(ss->ss3.r_wrap_mode); + + ErrorF(" Sampler 0:\n"); + ErrorF(" filter: min=%s, mag=%s\n", min, mag); + ErrorF(" wrap: s=%s, t=%s, r=%s\n", s_wrap, t_wrap, r_wrap); + + ss++; + min = gen7_filter_to_string(ss->ss0.min_filter); + mag = 
gen7_filter_to_string(ss->ss0.mag_filter); + + s_wrap = gen7_repeat_to_string(ss->ss3.s_wrap_mode); + t_wrap = gen7_repeat_to_string(ss->ss3.t_wrap_mode); + r_wrap = gen7_repeat_to_string(ss->ss3.r_wrap_mode); + + ErrorF(" Sampler 1:\n"); + ErrorF(" filter: min=%s, mag=%s\n", min, mag); + ErrorF(" wrap: s=%s, t=%s, r=%s\n", s_wrap, t_wrap, r_wrap); +} + +static const char * +gen7_blend_factor_to_string(uint32_t v) +{ + switch (v) { +#define C(x) case GEN7_BLENDFACTOR_##x: return #x; + C(ONE); + C(SRC_COLOR); + C(SRC_ALPHA); + C(DST_ALPHA); + C(DST_COLOR); + C(SRC_ALPHA_SATURATE); + C(CONST_COLOR); + C(CONST_ALPHA); + C(SRC1_COLOR); + C(SRC1_ALPHA); + C(ZERO); + C(INV_SRC_COLOR); + C(INV_SRC_ALPHA); + C(INV_DST_ALPHA); + C(INV_DST_COLOR); + C(INV_CONST_COLOR); + C(INV_CONST_ALPHA); + C(INV_SRC1_COLOR); + C(INV_SRC1_ALPHA); +#undef C + default: return "???"; + } +} + +static const char * +gen7_blend_function_to_string(uint32_t v) +{ + switch (v) { +#define C(x) case GEN7_BLENDFUNCTION_##x: return #x; + C(ADD); + C(SUBTRACT); + C(REVERSE_SUBTRACT); + C(MIN); + C(MAX); +#undef C + default: return "???"; + } +} + +static void +gen7_decode_blend(struct kgem *kgem, const uint32_t *reloc) +{ + const struct gen7_blend_state *blend; + struct reloc r; + const char *dst, *src; + const char *func; + + blend = get_reloc(kgem, state.dynamic_state.ptr, reloc, &r); + + dst = gen7_blend_factor_to_string(blend->blend0.dest_blend_factor); + src = gen7_blend_factor_to_string(blend->blend0.source_blend_factor); + func = gen7_blend_function_to_string(blend->blend0.blend_func); + + ErrorF(" Blend (%s): function %s, src=%s, dst=%s\n", + blend->blend0.blend_enable ? 
"enabled" : "disabled", + func, src, dst); +} + +int kgem_gen7_decode_3d(struct kgem *kgem, uint32_t offset) +{ + static const struct { + uint32_t opcode; + int min_len; + int max_len; + const char *name; + } opcodes[] = { + { 0x6101, 6, 6, "STATE_BASE_ADDRESS" }, + { 0x6102, 2, 2 , "STATE_SIP" }, + { 0x6104, 1, 1, "3DSTATE_PIPELINE_SELECT" }, + { 0x780a, 3, 3, "3DSTATE_INDEX_BUFFER" }, + { 0x7900, 4, 4, "3DSTATE_DRAWING_RECTANGLE" }, + }; + uint32_t *data = kgem->batch + offset; + uint32_t op; + unsigned int len; + int i; + const char *name; + + len = (data[0] & 0xff) + 2; + op = (data[0] & 0xffff0000) >> 16; + switch (op) { + case 0x6101: + i = 0; + kgem_debug_print(data, offset, i++, "STATE_BASE_ADDRESS\n"); + assert(len == 10); + + state_base_out(data, offset, i++, "general"); + state_base_out(data, offset, i++, "surface"); + state_base_out(data, offset, i++, "dynamic"); + state_base_out(data, offset, i++, "indirect"); + state_base_out(data, offset, i++, "instruction"); + + state_max_out(data, offset, i++, "general"); + state_max_out(data, offset, i++, "dynamic"); + state_max_out(data, offset, i++, "indirect"); + state_max_out(data, offset, i++, "instruction"); + + gen7_update_dynamic_buffer(kgem, offset + 3); + + return len; + + case 0x7808: + assert((len - 1) % 4 == 0); + kgem_debug_print(data, offset, 0, "3DSTATE_VERTEX_BUFFERS\n"); + + for (i = 1; i < len;) { + gen7_update_vertex_buffer(kgem, data + i); + + kgem_debug_print(data, offset, i, "buffer %d: %s, pitch %db\n", + data[i] >> 26, + data[i] & (1 << 20) ? 
"random" : "sequential", + data[i] & 0x07ff); + i++; + kgem_debug_print(data, offset, i++, "buffer address\n"); + kgem_debug_print(data, offset, i++, "max index\n"); + kgem_debug_print(data, offset, i++, "mbz\n"); + } + return len; + + case 0x7809: + assert((len + 1) % 2 == 0); + kgem_debug_print(data, offset, 0, "3DSTATE_VERTEX_ELEMENTS\n"); + + for (i = 1; i < len;) { + gen7_update_vertex_elements(kgem, (i - 1)/2, data + i); + + kgem_debug_print(data, offset, i, "buffer %d: %svalid, type 0x%04x, " + "src offset 0x%04x bytes\n", + data[i] >> 26, + data[i] & (1 << 25) ? "" : "in", + (data[i] >> 16) & 0x1ff, + data[i] & 0x07ff); + i++; + kgem_debug_print(data, offset, i, "(%s, %s, %s, %s), " + "dst offset 0x%02x bytes\n", + get_element_component(data[i], 0), + get_element_component(data[i], 1), + get_element_component(data[i], 2), + get_element_component(data[i], 3), + (data[i] & 0xff) * 4); + i++; + } + return len; + + case 0x780a: + assert(len == 3); + kgem_debug_print(data, offset, 0, "3DSTATE_INDEX_BUFFER\n"); + kgem_debug_print(data, offset, 1, "beginning buffer address\n"); + kgem_debug_print(data, offset, 2, "ending buffer address\n"); + return len; + + case 0x7b00: + assert(len == 7); + kgem_debug_print(data, offset, 0, "3DPRIMITIVE\n"); + kgem_debug_print(data, offset, 1, "type %s, %s\n", + get_prim_type(data[1]), + (data[1] & (1 << 15)) ? 
"random" : "sequential"); + kgem_debug_print(data, offset, 2, "vertex count\n"); + kgem_debug_print(data, offset, 3, "start vertex\n"); + kgem_debug_print(data, offset, 4, "instance count\n"); + kgem_debug_print(data, offset, 5, "start instance\n"); + kgem_debug_print(data, offset, 6, "index bias\n"); + primitive_out(kgem, data); + return len; + } + + /* For the rest, just dump the bytes */ + name = NULL; + for (i = 0; i < ARRAY_SIZE(opcodes); i++) + if (op == opcodes[i].opcode) { + name = opcodes[i].name; + break; + } + + len = (data[0] & 0xff) + 2; + if (name == NULL) { + kgem_debug_print(data, offset, 0, "unknown\n"); + } else { + kgem_debug_print(data, offset, 0, "%s\n", opcodes[i].name); + if (opcodes[i].max_len > 1) { + assert(len >= opcodes[i].min_len && + len <= opcodes[i].max_len); + } + } + for (i = 1; i < len; i++) + kgem_debug_print(data, offset, i, "dword %d\n", i); + + return len; +} + +void kgem_gen7_finish_state(struct kgem *kgem) +{ + finish_state(kgem); +} diff --git a/cogl/driver/drm/render_program/exa_sf.g4b b/cogl/driver/drm/render_program/exa_sf.g4b new file mode 100644 index 00000000..223c9c9a --- /dev/null +++ b/cogl/driver/drm/render_program/exa_sf.g4b @@ -0,0 +1,15 @@ + { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 }, + { 0x00400001, 0x206003be, 0x00690060, 0x00000000 }, + { 0x00400040, 0x20e077bd, 0x00690080, 0x006940a0 }, + { 0x00400041, 0x202077be, 0x006900e0, 0x000000c0 }, + { 0x00400040, 0x20e077bd, 0x006900a0, 0x00694060 }, + { 0x00400041, 0x204077be, 0x006900e0, 0x000000c8 }, + { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 
0x00000000, 0x00000000 }, diff --git a/cogl/driver/drm/render_program/exa_sf.g5b b/cogl/driver/drm/render_program/exa_sf.g5b new file mode 100644 index 00000000..d1035aed --- /dev/null +++ b/cogl/driver/drm/render_program/exa_sf.g5b @@ -0,0 +1,7 @@ + { 0x00400031, 0x20c01fbd, 0x1069002c, 0x02100001 }, + { 0x00400001, 0x206003be, 0x00690060, 0x00000000 }, + { 0x00400040, 0x20e077bd, 0x00690080, 0x006940a0 }, + { 0x00400041, 0x202077be, 0x006900e0, 0x000000c0 }, + { 0x00400040, 0x20e077bd, 0x006900a0, 0x00694060 }, + { 0x00400041, 0x204077be, 0x006900e0, 0x000000c8 }, + { 0x00600031, 0x20001fbc, 0x648d0000, 0x8808c800 }, diff --git a/cogl/driver/drm/render_program/exa_sf_mask.g4b b/cogl/driver/drm/render_program/exa_sf_mask.g4b new file mode 100644 index 00000000..be0a77b0 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_sf_mask.g4b @@ -0,0 +1,15 @@ + { 0x00400031, 0x20c01fbd, 0x0069002c, 0x01110001 }, + { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 }, + { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d40a0 }, + { 0x00600041, 0x202077be, 0x008d00e0, 0x000000c0 }, + { 0x00600040, 0x20e077bd, 0x008d00a0, 0x008d4060 }, + { 0x00600041, 0x204077be, 0x008d00e0, 0x000000c8 }, + { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, diff --git a/cogl/driver/drm/render_program/exa_sf_mask.g5b b/cogl/driver/drm/render_program/exa_sf_mask.g5b new file mode 100644 index 00000000..76a03f87 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_sf_mask.g5b @@ -0,0 +1,7 @@ + { 0x00400031, 0x20c01fbd, 0x1069002c, 0x02100001 }, + { 0x00600001, 0x206003be, 
0x008d0060, 0x00000000 }, + { 0x00600040, 0x20e077bd, 0x008d0080, 0x008d40a0 }, + { 0x00600041, 0x202077be, 0x008d00e0, 0x000000c0 }, + { 0x00600040, 0x20e077bd, 0x008d00a0, 0x008d4060 }, + { 0x00600041, 0x204077be, 0x008d00e0, 0x000000c8 }, + { 0x00600031, 0x20001fbc, 0x648d0000, 0x8808c800 }, diff --git a/cogl/driver/drm/render_program/exa_wm_ca.g4b b/cogl/driver/drm/render_program/exa_wm_ca.g4b new file mode 100644 index 00000000..372e8b26 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_ca.g4b @@ -0,0 +1,4 @@ + { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d02c0 }, + { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0300 }, + { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0340 }, + { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 }, diff --git a/cogl/driver/drm/render_program/exa_wm_ca.g5b b/cogl/driver/drm/render_program/exa_wm_ca.g5b new file mode 100644 index 00000000..372e8b26 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_ca.g5b @@ -0,0 +1,4 @@ + { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d02c0 }, + { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0300 }, + { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0340 }, + { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 }, diff --git a/cogl/driver/drm/render_program/exa_wm_ca.g6b b/cogl/driver/drm/render_program/exa_wm_ca.g6b new file mode 100644 index 00000000..521a5b64 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_ca.g6b @@ -0,0 +1,4 @@ + { 0x00800041, 0x21c077bd, 0x008d01c0, 0x008d02c0 }, + { 0x00800041, 0x220077bd, 0x008d0200, 0x008d0300 }, + { 0x00800041, 0x224077bd, 0x008d0240, 0x008d0340 }, + { 0x00800041, 0x228077bd, 0x008d0280, 0x008d0380 }, diff --git a/cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g4b b/cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g4b new file mode 100644 index 00000000..963d6760 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g4b @@ -0,0 +1,4 @@ + { 0x00802041, 0x21c077bd, 0x008d02c0, 0x008d0280 }, + { 0x00802041, 0x220077bd, 0x008d0300, 
0x008d0280 }, + { 0x00802041, 0x224077bd, 0x008d0340, 0x008d0280 }, + { 0x00802041, 0x228077bd, 0x008d0380, 0x008d0280 }, diff --git a/cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g5b b/cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g5b new file mode 100644 index 00000000..963d6760 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g5b @@ -0,0 +1,4 @@ + { 0x00802041, 0x21c077bd, 0x008d02c0, 0x008d0280 }, + { 0x00802041, 0x220077bd, 0x008d0300, 0x008d0280 }, + { 0x00802041, 0x224077bd, 0x008d0340, 0x008d0280 }, + { 0x00802041, 0x228077bd, 0x008d0380, 0x008d0280 }, diff --git a/cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g6b b/cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g6b new file mode 100644 index 00000000..d5ab7e42 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_ca_srcalpha.g6b @@ -0,0 +1,4 @@ + { 0x00800041, 0x21c077bd, 0x008d02c0, 0x008d0280 }, + { 0x00800041, 0x220077bd, 0x008d0300, 0x008d0280 }, + { 0x00800041, 0x224077bd, 0x008d0340, 0x008d0280 }, + { 0x00800041, 0x228077bd, 0x008d0380, 0x008d0280 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_affine.g4b b/cogl/driver/drm/render_program/exa_wm_mask_affine.g4b new file mode 100644 index 00000000..14a54517 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_affine.g4b @@ -0,0 +1,8 @@ + { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000a0 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x000000a4 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x210077be, 0x008d03c0, 0x000000ac }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000b0 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x000000b4 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x214077be, 0x008d03c0, 0x000000bc }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_affine.g5b b/cogl/driver/drm/render_program/exa_wm_mask_affine.g5b new file mode 100644 index 00000000..e265beed --- /dev/null +++ 
b/cogl/driver/drm/render_program/exa_wm_mask_affine.g5b @@ -0,0 +1,4 @@ + { 0x00802059, 0x200077bc, 0x000000a0, 0x008d0100 }, + { 0x00802048, 0x210077be, 0x000000a4, 0x008d0140 }, + { 0x00802059, 0x200077bc, 0x000000b0, 0x008d0100 }, + { 0x00802048, 0x214077be, 0x000000b4, 0x008d0140 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_affine.g6b b/cogl/driver/drm/render_program/exa_wm_mask_affine.g6b new file mode 100644 index 00000000..e4bef29e --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_affine.g6b @@ -0,0 +1,4 @@ + { 0x0060005a, 0x210077be, 0x00000100, 0x008d0040 }, + { 0x0060005a, 0x212077be, 0x00000100, 0x008d0080 }, + { 0x0060005a, 0x214077be, 0x00000110, 0x008d0040 }, + { 0x0060005a, 0x216077be, 0x00000110, 0x008d0080 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_affine.g7b b/cogl/driver/drm/render_program/exa_wm_mask_affine.g7b new file mode 100644 index 00000000..8d72599d --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_affine.g7b @@ -0,0 +1,4 @@ + { 0x0060005a, 0x290077bd, 0x00000100, 0x008d0040 }, + { 0x0060005a, 0x292077bd, 0x00000100, 0x008d0080 }, + { 0x0060005a, 0x294077bd, 0x00000110, 0x008d0040 }, + { 0x0060005a, 0x296077bd, 0x00000110, 0x008d0080 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_projective.g4b b/cogl/driver/drm/render_program/exa_wm_mask_projective.g4b new file mode 100644 index 00000000..78cb9aef --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_projective.g4b @@ -0,0 +1,16 @@ + { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000c0 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x000000c4 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x000000cc }, + { 0x00600031, 0x21801fbd, 0x008d03c0, 0x01110001 }, + { 0x00600031, 0x21a01fbd, 0x008d03e0, 0x01110001 }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000a0 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x000000a4 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 
0x00802040, 0x23c077bd, 0x008d03c0, 0x000000ac }, + { 0x00802041, 0x210077be, 0x008d03c0, 0x008d0180 }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000b0 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x000000b4 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x000000bc }, + { 0x00802041, 0x214077be, 0x008d03c0, 0x008d0180 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_projective.g5b b/cogl/driver/drm/render_program/exa_wm_mask_projective.g5b new file mode 100644 index 00000000..c3574594 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_projective.g5b @@ -0,0 +1,16 @@ + { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000c0 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x000000c4 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x000000cc }, + { 0x00600031, 0x21801fbd, 0x108d03c0, 0x02100001 }, + { 0x00600031, 0x21a01fbd, 0x108d03e0, 0x02100001 }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000a0 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x000000a4 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x000000ac }, + { 0x00802041, 0x210077be, 0x008d03c0, 0x008d0180 }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x000000b0 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x000000b4 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x000000bc }, + { 0x00802041, 0x214077be, 0x008d03c0, 0x008d0180 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_projective.g6b b/cogl/driver/drm/render_program/exa_wm_mask_projective.g6b new file mode 100644 index 00000000..dddcb4bd --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_projective.g6b @@ -0,0 +1,12 @@ + { 0x0060005a, 0x23c077bd, 0x00000120, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x00000120, 0x008d0080 }, + { 0x01600038, 0x218003bd, 0x008d03c0, 0x00000000 }, + { 0x01600038, 0x21a003bd, 0x008d03e0, 0x00000000 }, + 
{ 0x0060005a, 0x23c077bd, 0x00000100, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x00000100, 0x008d0080 }, + { 0x00600041, 0x210077be, 0x008d03c0, 0x008d0180 }, + { 0x00600041, 0x212077be, 0x008d03e0, 0x008d01a0 }, + { 0x0060005a, 0x23c077bd, 0x00000110, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x00000110, 0x008d0080 }, + { 0x00600041, 0x214077be, 0x008d03c0, 0x008d0180 }, + { 0x00600041, 0x216077be, 0x008d03e0, 0x008d01a0 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_projective.g7b b/cogl/driver/drm/render_program/exa_wm_mask_projective.g7b new file mode 100644 index 00000000..a2e9267b --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_projective.g7b @@ -0,0 +1,12 @@ + { 0x0060005a, 0x23c077bd, 0x00000120, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x00000120, 0x008d0080 }, + { 0x01600038, 0x218003bd, 0x008d03c0, 0x00000000 }, + { 0x01600038, 0x21a003bd, 0x008d03e0, 0x00000000 }, + { 0x0060005a, 0x23c077bd, 0x00000100, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x00000100, 0x008d0080 }, + { 0x00600041, 0x290077bd, 0x008d03c0, 0x008d0180 }, + { 0x00600041, 0x292077bd, 0x008d03e0, 0x008d01a0 }, + { 0x0060005a, 0x23c077bd, 0x00000110, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x00000110, 0x008d0080 }, + { 0x00600041, 0x294077bd, 0x008d03c0, 0x008d0180 }, + { 0x00600041, 0x296077bd, 0x008d03e0, 0x008d01a0 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g4b b/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g4b new file mode 100644 index 00000000..7db47ca4 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g4b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00007000 }, + { 0x00600001, 0x20e00022, 0x008d0000, 0x00000000 }, + { 0x07800031, 0x23801c09, 0x00000000, 0x02520102 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g5b b/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g5b new file mode 100644 index 00000000..472c2bbe --- /dev/null +++ 
b/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g5b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00007000 }, + { 0x00600001, 0x20e00022, 0x008d0000, 0x00000000 }, + { 0x07800031, 0x23801c09, 0x20000000, 0x0a2a0102 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g6b b/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g6b new file mode 100644 index 00000000..6d1eae93 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g6b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00007000 }, + { 0x00600001, 0x20e00022, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x23801cc9, 0x000000e0, 0x0a2a0102 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g7b b/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g7b new file mode 100644 index 00000000..fa36a59e --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_sample_a.g7b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00007000 }, + { 0x00600001, 0x28e00021, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x23801ca9, 0x000008e0, 0x0a2c0102 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g4b b/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g4b new file mode 100644 index 00000000..9026ee2a --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g4b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00000000 }, + { 0x00600001, 0x20e00022, 0x008d0000, 0x00000000 }, + { 0x07800031, 0x22c01c09, 0x00000000, 0x02580102 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g5b b/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g5b new file mode 100644 index 00000000..cb112d56 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g5b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00000000 }, + { 0x00600001, 0x20e00022, 0x008d0000, 0x00000000 }, + { 0x07800031, 0x22c01c09, 0x20000000, 0x0a8a0102 }, diff --git 
a/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g6b b/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g6b new file mode 100644 index 00000000..e5630bd1 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g6b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00000000 }, + { 0x00600001, 0x20e00022, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x22c01cc9, 0x000000e0, 0x0a8a0102 }, diff --git a/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g7b b/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g7b new file mode 100644 index 00000000..01edf7d5 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_mask_sample_argb.g7b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00000000 }, + { 0x00600001, 0x28e00021, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x22c01ca9, 0x000008e0, 0x0a8c0102 }, diff --git a/cogl/driver/drm/render_program/exa_wm_noca.g4b b/cogl/driver/drm/render_program/exa_wm_noca.g4b new file mode 100644 index 00000000..15063341 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_noca.g4b @@ -0,0 +1,4 @@ + { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d0380 }, + { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0380 }, + { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0380 }, + { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 }, diff --git a/cogl/driver/drm/render_program/exa_wm_noca.g5b b/cogl/driver/drm/render_program/exa_wm_noca.g5b new file mode 100644 index 00000000..15063341 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_noca.g5b @@ -0,0 +1,4 @@ + { 0x00802041, 0x21c077bd, 0x008d01c0, 0x008d0380 }, + { 0x00802041, 0x220077bd, 0x008d0200, 0x008d0380 }, + { 0x00802041, 0x224077bd, 0x008d0240, 0x008d0380 }, + { 0x00802041, 0x228077bd, 0x008d0280, 0x008d0380 }, diff --git a/cogl/driver/drm/render_program/exa_wm_noca.g6b b/cogl/driver/drm/render_program/exa_wm_noca.g6b new file mode 100644 index 00000000..e77ea2dd --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_noca.g6b @@ -0,0 
+1,4 @@ + { 0x00800041, 0x21c077bd, 0x008d01c0, 0x008d0380 }, + { 0x00800041, 0x220077bd, 0x008d0200, 0x008d0380 }, + { 0x00800041, 0x224077bd, 0x008d0240, 0x008d0380 }, + { 0x00800041, 0x228077bd, 0x008d0280, 0x008d0380 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_affine.g4b b/cogl/driver/drm/render_program/exa_wm_src_affine.g4b new file mode 100644 index 00000000..d30da873 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_affine.g4b @@ -0,0 +1,8 @@ + { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000060 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x00000064 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x204077be, 0x008d03c0, 0x0000006c }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000070 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x00000074 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x208077be, 0x008d03c0, 0x0000007c }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_affine.g5b b/cogl/driver/drm/render_program/exa_wm_src_affine.g5b new file mode 100644 index 00000000..f526adf7 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_affine.g5b @@ -0,0 +1,4 @@ + { 0x00802059, 0x200077bc, 0x00000060, 0x008d0100 }, + { 0x00802048, 0x204077be, 0x00000064, 0x008d0140 }, + { 0x00802059, 0x200077bc, 0x00000070, 0x008d0100 }, + { 0x00802048, 0x208077be, 0x00000074, 0x008d0140 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_affine.g6b b/cogl/driver/drm/render_program/exa_wm_src_affine.g6b new file mode 100644 index 00000000..7035e6a5 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_affine.g6b @@ -0,0 +1,4 @@ + { 0x0060005a, 0x204077be, 0x000000c0, 0x008d0040 }, + { 0x0060005a, 0x206077be, 0x000000c0, 0x008d0080 }, + { 0x0060005a, 0x208077be, 0x000000d0, 0x008d0040 }, + { 0x0060005a, 0x20a077be, 0x000000d0, 0x008d0080 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_affine.g7b b/cogl/driver/drm/render_program/exa_wm_src_affine.g7b new file mode 100644 
index 00000000..f545fba1 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_affine.g7b @@ -0,0 +1,4 @@ + { 0x0060005a, 0x284077bd, 0x000000c0, 0x008d0040 }, + { 0x0060005a, 0x286077bd, 0x000000c0, 0x008d0080 }, + { 0x0060005a, 0x288077bd, 0x000000d0, 0x008d0040 }, + { 0x0060005a, 0x28a077bd, 0x000000d0, 0x008d0080 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_projective.g4b b/cogl/driver/drm/render_program/exa_wm_src_projective.g4b new file mode 100644 index 00000000..198bab3e --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_projective.g4b @@ -0,0 +1,16 @@ + { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000080 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x00000084 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x0000008c }, + { 0x00600031, 0x21801fbd, 0x008d03c0, 0x01110001 }, + { 0x00600031, 0x21a01fbd, 0x008d03e0, 0x01110001 }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000060 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x00000064 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x0000006c }, + { 0x00802041, 0x204077be, 0x008d03c0, 0x008d0180 }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000070 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x00000074 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x0000007c }, + { 0x00802041, 0x208077be, 0x008d03c0, 0x008d0180 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_projective.g5b b/cogl/driver/drm/render_program/exa_wm_src_projective.g5b new file mode 100644 index 00000000..ae3db8cd --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_projective.g5b @@ -0,0 +1,16 @@ + { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000080 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x00000084 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x0000008c }, + { 0x00600031, 0x21801fbd, 0x108d03c0, 0x02100001 
}, + { 0x00600031, 0x21a01fbd, 0x108d03e0, 0x02100001 }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000060 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x00000064 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x0000006c }, + { 0x00802041, 0x204077be, 0x008d03c0, 0x008d0180 }, + { 0x00802041, 0x23c077bd, 0x008d0100, 0x00000070 }, + { 0x00802041, 0x238077bd, 0x008d0140, 0x00000074 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x008d0380 }, + { 0x00802040, 0x23c077bd, 0x008d03c0, 0x0000007c }, + { 0x00802041, 0x208077be, 0x008d03c0, 0x008d0180 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_projective.g6b b/cogl/driver/drm/render_program/exa_wm_src_projective.g6b new file mode 100644 index 00000000..8e39bffa --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_projective.g6b @@ -0,0 +1,12 @@ + { 0x0060005a, 0x23c077bd, 0x000000e0, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x000000e0, 0x008d0080 }, + { 0x01600038, 0x218003bd, 0x008d03c0, 0x00000000 }, + { 0x01600038, 0x21a003bd, 0x008d03e0, 0x00000000 }, + { 0x0060005a, 0x23c077bd, 0x000000c0, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x000000c0, 0x008d0080 }, + { 0x00600041, 0x204077be, 0x008d03c0, 0x008d0180 }, + { 0x00600041, 0x206077be, 0x008d03e0, 0x008d01a0 }, + { 0x0060005a, 0x23c077bd, 0x000000d0, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x000000d0, 0x008d0080 }, + { 0x00600041, 0x208077be, 0x008d03c0, 0x008d0180 }, + { 0x00600041, 0x20a077be, 0x008d03e0, 0x008d01a0 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_projective.g7b b/cogl/driver/drm/render_program/exa_wm_src_projective.g7b new file mode 100644 index 00000000..73727ffd --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_projective.g7b @@ -0,0 +1,12 @@ + { 0x0060005a, 0x23c077bd, 0x000000e0, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x000000e0, 0x008d0080 }, + { 0x01600038, 0x218003bd, 0x008d03c0, 0x00000000 }, + { 0x01600038, 0x21a003bd, 0x008d03e0, 0x00000000 }, 
+ { 0x0060005a, 0x23c077bd, 0x000000c0, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x000000c0, 0x008d0080 }, + { 0x00600041, 0x284077bd, 0x008d03c0, 0x008d0180 }, + { 0x00600041, 0x286077bd, 0x008d03e0, 0x008d01a0 }, + { 0x0060005a, 0x23c077bd, 0x000000d0, 0x008d0040 }, + { 0x0060005a, 0x23e077bd, 0x000000d0, 0x008d0080 }, + { 0x00600041, 0x208077be, 0x008d03c0, 0x008d0180 }, + { 0x00600041, 0x28a077bd, 0x008d03e0, 0x008d01a0 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_a.g4b b/cogl/driver/drm/render_program/exa_wm_src_sample_a.g4b new file mode 100644 index 00000000..5e5a11f9 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_a.g4b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00007000 }, + { 0x00600001, 0x20200022, 0x008d0000, 0x00000000 }, + { 0x01800031, 0x22801c09, 0x00000000, 0x02520001 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_a.g5b b/cogl/driver/drm/render_program/exa_wm_src_sample_a.g5b new file mode 100644 index 00000000..0e4eebe2 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_a.g5b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00007000 }, + { 0x00600001, 0x20200022, 0x008d0000, 0x00000000 }, + { 0x01800031, 0x22801c09, 0x20000000, 0x0a2a0001 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_a.g6b b/cogl/driver/drm/render_program/exa_wm_src_sample_a.g6b new file mode 100644 index 00000000..0b4a955d --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_a.g6b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00007000 }, + { 0x00600001, 0x20200022, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x22801cc9, 0x00000020, 0x0a2a0001 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_a.g7b b/cogl/driver/drm/render_program/exa_wm_src_sample_a.g7b new file mode 100644 index 00000000..73912b75 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_a.g7b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 
0x00000000, 0x00007000 }, + { 0x00600001, 0x28200021, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x22801ca9, 0x00000820, 0x0a2c0001 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g4b b/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g4b new file mode 100644 index 00000000..a15e40a0 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g4b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0000, 0x00000000 }, + { 0x01800031, 0x21c01c09, 0x00000000, 0x02580001 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g5b b/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g5b new file mode 100644 index 00000000..f8cb41ef --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g5b @@ -0,0 +1,2 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00000000 }, + { 0x01800031, 0x21c01d29, 0x208d0000, 0x0a8a0001 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g6b b/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g6b new file mode 100644 index 00000000..8bfe8498 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g6b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x21c01cc9, 0x00000020, 0x0a8a0001 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g7b b/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g7b new file mode 100644 index 00000000..a282cf8f --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_argb.g7b @@ -0,0 +1,3 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x00000000 }, + { 0x00600001, 0x28200021, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x21c01ca9, 0x00000820, 0x0a8c0001 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g4b b/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g4b new file mode 100644 index 00000000..c8dc47d7 --- 
/dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g4b @@ -0,0 +1,5 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x0000e000 }, + { 0x00600001, 0x20200022, 0x008d0000, 0x00000000 }, + { 0x01800031, 0x22001c09, 0x00000000, 0x02520001 }, + { 0x01800031, 0x21c01c09, 0x00000000, 0x02520003 }, + { 0x01800031, 0x22401c09, 0x00000000, 0x02520005 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g5b b/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g5b new file mode 100644 index 00000000..ce3670b9 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g5b @@ -0,0 +1,5 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x0000e000 }, + { 0x00600001, 0x20200022, 0x008d0000, 0x00000000 }, + { 0x01800031, 0x22001c09, 0x20000000, 0x0a2a0001 }, + { 0x01800031, 0x21c01c09, 0x20000000, 0x0a2a0003 }, + { 0x01800031, 0x22401c09, 0x20000000, 0x0a2a0005 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g6b b/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g6b new file mode 100644 index 00000000..0a22827e --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g6b @@ -0,0 +1,5 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x0000e000 }, + { 0x00600001, 0x20200022, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x22001cc9, 0x00000020, 0x0a2a0001 }, + { 0x02800031, 0x21c01cc9, 0x00000020, 0x0a2a0003 }, + { 0x02800031, 0x22401cc9, 0x00000020, 0x0a2a0005 }, diff --git a/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g7b b/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g7b new file mode 100644 index 00000000..ddd6f365 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_src_sample_planar.g7b @@ -0,0 +1,5 @@ + { 0x00000201, 0x20080061, 0x00000000, 0x0000e000 }, + { 0x00600001, 0x28200021, 0x008d0000, 0x00000000 }, + { 0x02800031, 0x22001ca9, 0x00000820, 0x0a2c0001 }, + { 0x02800031, 0x21c01ca9, 0x00000820, 0x0a2c0003 }, + { 0x02800031, 0x22401ca9, 0x00000820, 0x0a2c0005 
}, diff --git a/cogl/driver/drm/render_program/exa_wm_write.g4b b/cogl/driver/drm/render_program/exa_wm_write.g4b new file mode 100644 index 00000000..92e7b248 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_write.g4b @@ -0,0 +1,18 @@ + { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 }, + { 0x00600001, 0x206003be, 0x008d0200, 0x00000000 }, + { 0x00600001, 0x208003be, 0x008d0240, 0x00000000 }, + { 0x00600001, 0x20a003be, 0x008d0280, 0x00000000 }, + { 0x00601001, 0x20c003be, 0x008d01e0, 0x00000000 }, + { 0x00601001, 0x20e003be, 0x008d0220, 0x00000000 }, + { 0x00601001, 0x210003be, 0x008d0260, 0x00000000 }, + { 0x00601001, 0x212003be, 0x008d02a0, 0x00000000 }, + { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 }, + { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, diff --git a/cogl/driver/drm/render_program/exa_wm_write.g5b b/cogl/driver/drm/render_program/exa_wm_write.g5b new file mode 100644 index 00000000..aff2ce01 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_write.g5b @@ -0,0 +1,6 @@ + { 0x00802001, 0x304003be, 0x008d01c0, 0x00000000 }, + { 0x00802001, 0x306003be, 0x008d0200, 0x00000000 }, + { 0x00802001, 0x308003be, 0x008d0240, 0x00000000 }, + { 0x00802001, 0x30a003be, 0x008d0280, 0x00000000 }, + { 0x00600201, 0x202003be, 0x008d0020, 0x00000000 }, + { 0x00800031, 0x24001d28, 0x548d0000, 0x94084800 }, diff --git a/cogl/driver/drm/render_program/exa_wm_write.g6b b/cogl/driver/drm/render_program/exa_wm_write.g6b new file mode 100644 index 00000000..3cb6bff3 --- /dev/null +++ 
b/cogl/driver/drm/render_program/exa_wm_write.g6b @@ -0,0 +1,17 @@ + { 0x00600001, 0x204003be, 0x008d01c0, 0x00000000 }, + { 0x00600001, 0x206003be, 0x008d01e0, 0x00000000 }, + { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 }, + { 0x00600001, 0x20a003be, 0x008d0220, 0x00000000 }, + { 0x00600001, 0x20c003be, 0x008d0240, 0x00000000 }, + { 0x00600001, 0x20e003be, 0x008d0260, 0x00000000 }, + { 0x00600001, 0x210003be, 0x008d0280, 0x00000000 }, + { 0x00600001, 0x212003be, 0x008d02a0, 0x00000000 }, + { 0x05800031, 0x24001cc8, 0x00000040, 0x90019000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, diff --git a/cogl/driver/drm/render_program/exa_wm_write.g7b b/cogl/driver/drm/render_program/exa_wm_write.g7b new file mode 100644 index 00000000..f31af518 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_write.g7b @@ -0,0 +1,17 @@ + { 0x00600001, 0x284003bd, 0x008d01c0, 0x00000000 }, + { 0x00600001, 0x286003bd, 0x008d01e0, 0x00000000 }, + { 0x00600001, 0x288003bd, 0x008d0200, 0x00000000 }, + { 0x00600001, 0x28a003bd, 0x008d0220, 0x00000000 }, + { 0x00600001, 0x28c003bd, 0x008d0240, 0x00000000 }, + { 0x00600001, 0x28e003bd, 0x008d0260, 0x00000000 }, + { 0x00600001, 0x290003bd, 0x008d0280, 0x00000000 }, + { 0x00600001, 0x292003bd, 0x008d02a0, 0x00000000 }, + { 0x05800031, 0x24001ca8, 0x00000840, 0x90031000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 
0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, diff --git a/cogl/driver/drm/render_program/exa_wm_xy.g4b b/cogl/driver/drm/render_program/exa_wm_xy.g4b new file mode 100644 index 00000000..327fc29c --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_xy.g4b @@ -0,0 +1,4 @@ + { 0x00800040, 0x23c06d29, 0x00480028, 0x10101010 }, + { 0x00800040, 0x23806d29, 0x0048002a, 0x11001100 }, + { 0x00802040, 0x2100753d, 0x008d03c0, 0x00004020 }, + { 0x00802040, 0x2140753d, 0x008d0380, 0x00004024 }, diff --git a/cogl/driver/drm/render_program/exa_wm_xy.g5b b/cogl/driver/drm/render_program/exa_wm_xy.g5b new file mode 100644 index 00000000..327fc29c --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_xy.g5b @@ -0,0 +1,4 @@ + { 0x00800040, 0x23c06d29, 0x00480028, 0x10101010 }, + { 0x00800040, 0x23806d29, 0x0048002a, 0x11001100 }, + { 0x00802040, 0x2100753d, 0x008d03c0, 0x00004020 }, + { 0x00802040, 0x2140753d, 0x008d0380, 0x00004024 }, diff --git a/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g4b b/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g4b new file mode 100644 index 00000000..01f6e2b2 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g4b @@ -0,0 +1,12 @@ + { 0x00802040, 0x23007fbd, 0x008d0200, 0xbd808081 }, + { 0x00802041, 0x23007fbd, 0x008d0300, 0x3f94fdf4 }, + { 0x00802040, 0x22c07fbd, 0x008d01c0, 0xbf008084 }, + { 0x00802040, 0x23407fbd, 0x008d0240, 0xbf008084 }, + { 0x00802001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x80802048, 0x21c07fbd, 0x008d02c0, 0x3fcc49ba }, + { 0x00802001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x00802048, 0x24007fbc, 0x008d02c0, 0xbf5020c5 }, + { 0x80802048, 0x22007fbd, 0x008d0340, 0xbec8b439 }, + { 0x00802001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x80802048, 0x22407fbd, 0x008d0340, 0x40011687 }, + { 0x00802001, 0x228003fd, 0x00000000, 0x3f800000 }, diff --git 
a/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g5b b/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g5b new file mode 100644 index 00000000..01f6e2b2 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g5b @@ -0,0 +1,12 @@ + { 0x00802040, 0x23007fbd, 0x008d0200, 0xbd808081 }, + { 0x00802041, 0x23007fbd, 0x008d0300, 0x3f94fdf4 }, + { 0x00802040, 0x22c07fbd, 0x008d01c0, 0xbf008084 }, + { 0x00802040, 0x23407fbd, 0x008d0240, 0xbf008084 }, + { 0x00802001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x80802048, 0x21c07fbd, 0x008d02c0, 0x3fcc49ba }, + { 0x00802001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x00802048, 0x24007fbc, 0x008d02c0, 0xbf5020c5 }, + { 0x80802048, 0x22007fbd, 0x008d0340, 0xbec8b439 }, + { 0x00802001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x80802048, 0x22407fbd, 0x008d0340, 0x40011687 }, + { 0x00802001, 0x228003fd, 0x00000000, 0x3f800000 }, diff --git a/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g6b b/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g6b new file mode 100644 index 00000000..01ec5e50 --- /dev/null +++ b/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g6b @@ -0,0 +1,12 @@ + { 0x00800040, 0x23007fbd, 0x008d0200, 0xbd808081 }, + { 0x00800041, 0x23007fbd, 0x008d0300, 0x3f94fdf4 }, + { 0x00800040, 0x22c07fbd, 0x008d01c0, 0xbf008084 }, + { 0x00800040, 0x23407fbd, 0x008d0240, 0xbf008084 }, + { 0x00800001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x80800048, 0x21c07fbd, 0x008d02c0, 0x3fcc49ba }, + { 0x00800001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x00800048, 0x24007fbc, 0x008d02c0, 0xbf5020c5 }, + { 0x80800048, 0x22007fbd, 0x008d0340, 0xbec8b439 }, + { 0x00800001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x80800048, 0x22407fbd, 0x008d0340, 0x40011687 }, + { 0x00800001, 0x228003fd, 0x00000000, 0x3f800000 }, diff --git a/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g7b b/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g7b new file mode 100644 index 00000000..01ec5e50 --- /dev/null +++ 
b/cogl/driver/drm/render_program/exa_wm_yuv_rgb.g7b @@ -0,0 +1,12 @@ + { 0x00800040, 0x23007fbd, 0x008d0200, 0xbd808081 }, + { 0x00800041, 0x23007fbd, 0x008d0300, 0x3f94fdf4 }, + { 0x00800040, 0x22c07fbd, 0x008d01c0, 0xbf008084 }, + { 0x00800040, 0x23407fbd, 0x008d0240, 0xbf008084 }, + { 0x00800001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x80800048, 0x21c07fbd, 0x008d02c0, 0x3fcc49ba }, + { 0x00800001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x00800048, 0x24007fbc, 0x008d02c0, 0xbf5020c5 }, + { 0x80800048, 0x22007fbd, 0x008d0340, 0xbec8b439 }, + { 0x00800001, 0x240003bc, 0x008d0300, 0x00000000 }, + { 0x80800048, 0x22407fbd, 0x008d0340, 0x40011687 }, + { 0x00800001, 0x228003fd, 0x00000000, 0x3f800000 }, diff --git a/cogl/driver/drm/sna.h b/cogl/driver/drm/sna.h new file mode 100644 index 00000000..a56c70a8 --- /dev/null +++ b/cogl/driver/drm/sna.h @@ -0,0 +1,829 @@ +/************************************************************************** + +Copyright 1998-1999 Precision Insight, Inc., Cedar Park, Texas. +Copyright © 2002 David Dawes + +All Rights Reserved. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sub license, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice (including the +next paragraph) shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR +ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +**************************************************************************/ + +/* + * Authors: + * Keith Whitwell + * David Dawes + * + */ + +#ifndef _SNA_H_ +#define _SNA_H_ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include "compiler.h" + +#if 0 +#include + +#include +#if XF86_CRTC_VERSION >= 5 +#define HAS_PIXMAP_SHARING 1 +#endif + +#include +#include +#include +#include +#include + +#include + +#include + +#include "../compat-api.h" + +#include +#include + +#ifdef HAVE_DRI2_H +#include +#endif +#endif + +#if HAVE_UDEV +#include +#endif + +#if HAS_DEBUG_FULL +#define DBG(x) ErrorF x +#else +#define DBG(x) +#endif + +#define DEBUG_NO_RENDER 0 +#define DEBUG_NO_BLT 0 + +#define DEBUG_FLUSH_BATCH 0 +#define DEBUG_FLUSH_SYNC 0 + +#define TEST_ALL 0 +#define TEST_ACCEL (TEST_ALL || 0) +#define TEST_BATCH (TEST_ALL || 0) +#define TEST_BLT (TEST_ALL || 0) +#define TEST_COMPOSITE (TEST_ALL || 0) +#define TEST_DAMAGE (TEST_ALL || 0) +#define TEST_GRADIENT (TEST_ALL || 0) +#define TEST_GLYPHS (TEST_ALL || 0) +#define TEST_IO (TEST_ALL || 0) +#define TEST_KGEM (TEST_ALL || 0) +#define TEST_RENDER (TEST_ALL || 0) + +//#include "intel_driver.h" +//#include "intel_list.h" +#include "kgem.h" +//#include "sna_damage.h" +#include "sna_render.h" +//#include "fb/fb.h" + +#define SNA_CURSOR_X 64 +#define SNA_CURSOR_Y SNA_CURSOR_X + +#if 0 +struct sna_pixmap { + PixmapPtr pixmap; + struct kgem_bo *gpu_bo, *cpu_bo; + struct sna_damage *gpu_damage, *cpu_damage; + void *ptr; + + struct list list; + + uint32_t stride; + uint32_t clear_color; + + uint32_t flush; + +#define SOURCE_BIAS 4 + uint16_t source_count; + uint8_t pinned :3; +#define PIN_SCANOUT 0x1 +#define PIN_DRI 0x2 +#define PIN_PRIME 
0x4 + uint8_t mapped :1; + uint8_t shm :1; + uint8_t clear :1; + uint8_t undamaged :1; + uint8_t create :3; + uint8_t header :1; + uint8_t cpu :1; +}; +#endif + +struct sna_glyph { + PicturePtr atlas; + pixman_image_t *image; + struct sna_coordinate coordinate; + uint16_t size, pos; +}; + +static inline PixmapPtr get_window_pixmap(WindowPtr window) +{ + return fbGetWindowPixmap(window); +} + +static inline PixmapPtr get_drawable_pixmap(DrawablePtr drawable) +{ + if (drawable->type == DRAWABLE_PIXMAP) + return (PixmapPtr)drawable; + else + return get_window_pixmap((WindowPtr)drawable); +} + +extern DevPrivateKeyRec sna_pixmap_key; + +constant static inline struct sna_pixmap *sna_pixmap(PixmapPtr pixmap) +{ + return ((void **)dixGetPrivateAddr(&pixmap->devPrivates, &sna_pixmap_key))[1]; +} + +static inline struct sna_pixmap *sna_pixmap_from_drawable(DrawablePtr drawable) +{ + return sna_pixmap(get_drawable_pixmap(drawable)); +} + +struct sna_gc { + long changes; + long serial; + + GCFuncs *old_funcs; + void *priv; +}; + +static inline struct sna_gc *sna_gc(GCPtr gc) +{ + return dixGetPrivateAddr(&gc->devPrivates, &sna_gc_key); +} + +enum { + FLUSH_TIMER = 0, + THROTTLE_TIMER, + EXPIRE_TIMER, +#if DEBUG_MEMORY + DEBUG_MEMORY_TIMER, +#endif + NUM_TIMERS +}; + +struct sna { + //ScrnInfoPtr scrn; + + unsigned flags; +#define SNA_NO_THROTTLE 0x1 +#define SNA_NO_DELAYED_FLUSH 0x2 +#define SNA_NO_WAIT 0x4 +#define SNA_NO_FLIP 0x8 +#define SNA_TEAR_FREE 0x10 +#define SNA_FORCE_SHADOW 0x20 + + unsigned watch_flush; + + struct timeval timer_tv; + uint32_t timer_expire[NUM_TIMERS]; + uint16_t timer_active; + + int vblank_interval; + + //struct list flush_pixmaps; + //struct list active_pixmaps; + + PixmapPtr front; + PixmapPtr freed_pixmap; + + struct sna_mode { + drmModeResPtr kmode; + + int shadow_active; + DamagePtr shadow_damage; + struct kgem_bo *shadow; + int shadow_flip; + + //struct list outputs; + //struct list crtcs; + } mode; + + struct sna_dri { + void 
*flip_pending; + } dri; + + unsigned int tiling; +#define SNA_TILING_FB 0x1 +#define SNA_TILING_2D 0x2 +#define SNA_TILING_3D 0x4 +#define SNA_TILING_ALL (~0) + + EntityInfoPtr pEnt; + struct pci_device *PciInfo; + const struct intel_device_info *info; + + ScreenBlockHandlerProcPtr BlockHandler; + ScreenWakeupHandlerProcPtr WakeupHandler; + CloseScreenProcPtr CloseScreen; + + PicturePtr clear; + struct { + uint32_t fill_bo; + uint32_t fill_pixel; + uint32_t fill_alu; + } blt_state; + union { + struct gen2_render_state gen2; + struct gen3_render_state gen3; + struct gen4_render_state gen4; + struct gen5_render_state gen5; + struct gen6_render_state gen6; + struct gen7_render_state gen7; + } render_state; + uint32_t have_render; + + bool dri_available; + bool dri_open; + char *deviceName; + + /* Broken-out options. */ + OptionInfoPtr Options; + + /* Driver phase/state information */ + bool suspended; + +#if HAVE_UDEV + struct udev_monitor *uevent_monitor; + InputHandlerProc uevent_handler; +#endif + + struct kgem kgem; + struct sna_render render; + +#if DEBUG_MEMORY + struct { + int shadow_pixels_allocs; + int cpu_bo_allocs; + size_t shadow_pixels_bytes; + size_t cpu_bo_bytes; + } debug_memory; +#endif +}; + +bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna); +void sna_mode_adjust_frame(struct sna *sna, int x, int y); +extern void sna_mode_update(struct sna *sna); +extern void sna_mode_disable_unused(struct sna *sna); +extern void sna_mode_wakeup(struct sna *sna); +extern void sna_mode_redisplay(struct sna *sna); +extern void sna_mode_fini(struct sna *sna); + +extern int sna_page_flip(struct sna *sna, + struct kgem_bo *bo, + void *data, + int ref_crtc_hw_id); + +constant static inline struct sna * +to_sna(ScrnInfoPtr scrn) +{ + return (struct sna *)(scrn->driverPrivate); +} + +constant static inline struct sna * +to_sna_from_screen(ScreenPtr screen) +{ + return to_sna(xf86ScreenToScrn(screen)); +} + +constant static inline struct sna * 
+to_sna_from_pixmap(PixmapPtr pixmap) +{ + return ((void **)dixGetPrivateAddr(&pixmap->devPrivates, &sna_pixmap_key))[0]; +} + +constant static inline struct sna * +to_sna_from_drawable(DrawablePtr drawable) +{ + return to_sna_from_screen(drawable->pScreen); +} + +static inline struct sna * +to_sna_from_kgem(struct kgem *kgem) +{ + return container_of(kgem, struct sna, kgem); +} + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) +#endif + +#ifndef ALIGN +#define ALIGN(i,m) (((i) + (m) - 1) & ~((m) - 1)) +#endif + +#ifndef MIN +#define MIN(a,b) ((a) <= (b) ? (a) : (b)) +#endif + +#ifndef MAX +#define MAX(a,b) ((a) >= (b) ? (a) : (b)) +#endif + +extern xf86CrtcPtr sna_covering_crtc(ScrnInfoPtr scrn, + const BoxRec *box, + xf86CrtcPtr desired); + +extern bool sna_wait_for_scanline(struct sna *sna, PixmapPtr pixmap, + xf86CrtcPtr crtc, const BoxRec *clip); + +#if HAVE_DRI2_H +bool sna_dri_open(struct sna *sna, ScreenPtr pScreen); +void sna_dri_page_flip_handler(struct sna *sna, struct drm_event_vblank *event); +void sna_dri_vblank_handler(struct sna *sna, struct drm_event_vblank *event); +void sna_dri_destroy_window(WindowPtr win); +void sna_dri_close(struct sna *sna, ScreenPtr pScreen); +#else +static inline bool sna_dri_open(struct sna *sna, ScreenPtr pScreen) { return false; } +static inline void sna_dri_page_flip_handler(struct sna *sna, struct drm_event_vblank *event) { } +static inline void sna_dri_vblank_handler(struct sna *sna, struct drm_event_vblank *event) { } +static inline void sna_dri_destroy_window(WindowPtr win) { } +static inline void sna_dri_close(struct sna *sna, ScreenPtr pScreen) { } +#endif +void sna_dri_pixmap_update_bo(struct sna *sna, PixmapPtr pixmap); + +extern int sna_crtc_to_pipe(xf86CrtcPtr crtc); +extern int sna_crtc_to_plane(xf86CrtcPtr crtc); +extern int sna_crtc_id(xf86CrtcPtr crtc); + +CARD32 sna_format_for_depth(int depth); +CARD32 sna_render_format_for_depth(int depth); + +void sna_debug_flush(struct sna *sna); 
+ +static inline void +get_drawable_deltas(DrawablePtr drawable, PixmapPtr pixmap, int16_t *x, int16_t *y) +{ +#ifdef COMPOSITE + if (drawable->type == DRAWABLE_WINDOW) { + *x = -pixmap->screen_x; + *y = -pixmap->screen_y; + return; + } +#endif + *x = *y = 0; +} + +static inline int +get_drawable_dx(DrawablePtr drawable) +{ +#ifdef COMPOSITE + if (drawable->type == DRAWABLE_WINDOW) + return -get_drawable_pixmap(drawable)->screen_x; +#endif + return 0; +} + +static inline int +get_drawable_dy(DrawablePtr drawable) +{ +#ifdef COMPOSITE + if (drawable->type == DRAWABLE_WINDOW) + return -get_drawable_pixmap(drawable)->screen_y; +#endif + return 0; +} + +bool sna_pixmap_attach_to_bo(PixmapPtr pixmap, struct kgem_bo *bo); +static inline bool sna_pixmap_is_scanout(struct sna *sna, PixmapPtr pixmap) +{ + return (pixmap == sna->front && + !sna->mode.shadow_active && + (sna->flags & SNA_NO_WAIT) == 0); +} + +PixmapPtr sna_pixmap_create_upload(ScreenPtr screen, + int width, int height, int depth, + unsigned flags); +PixmapPtr sna_pixmap_create_unattached(ScreenPtr screen, + int width, int height, int depth); +void sna_pixmap_destroy(PixmapPtr pixmap); + +#define MOVE_WRITE 0x1 +#define MOVE_READ 0x2 +#define MOVE_INPLACE_HINT 0x4 +#define MOVE_ASYNC_HINT 0x8 +#define MOVE_SOURCE_HINT 0x10 +#define MOVE_WHOLE_HINT 0x20 +#define __MOVE_FORCE 0x40 +#define __MOVE_DRI 0x80 + +struct sna_pixmap *sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags); +static inline struct sna_pixmap * +sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags) +{ + /* Unlike move-to-gpu, we ignore wedged and always create the GPU bo */ + DBG(("%s(pixmap=%p, flags=%x)\n", __FUNCTION__, pixmap, flags)); + return sna_pixmap_move_to_gpu(pixmap, flags | __MOVE_FORCE); +} +bool must_check _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned flags); +static inline bool must_check sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned flags) +{ + if (flags == MOVE_READ) { + struct sna_pixmap *priv = 
sna_pixmap(pixmap); + if (priv == NULL) + return true; + } + + return _sna_pixmap_move_to_cpu(pixmap, flags); +} +bool must_check sna_drawable_move_region_to_cpu(DrawablePtr drawable, + RegionPtr region, + unsigned flags); + +bool must_check sna_drawable_move_to_cpu(DrawablePtr drawable, unsigned flags); + +static inline bool must_check +sna_drawable_move_to_gpu(DrawablePtr drawable, unsigned flags) +{ + return sna_pixmap_move_to_gpu(get_drawable_pixmap(drawable), flags) != NULL; +} + +void sna_add_flush_pixmap(struct sna *sna, + struct sna_pixmap *priv, + struct kgem_bo *bo); + +struct kgem_bo *sna_pixmap_change_tiling(PixmapPtr pixmap, uint32_t tiling); + +#define PREFER_GPU 0x1 +#define FORCE_GPU 0x2 +#define RENDER_GPU 0x4 +#define IGNORE_CPU 0x8 +#if 0 +struct kgem_bo * +sna_drawable_use_bo(DrawablePtr drawable, unsigned flags, const BoxRec *box, + struct sna_damage ***damage); +#endif + +inline static int16_t bound(int16_t a, uint16_t b) +{ + int v = (int)a + (int)b; + if (v > MAXSHORT) + return MAXSHORT; + return v; +} + +inline static int16_t clamp(int16_t a, int16_t b) +{ + int v = (int)a + (int)b; + if (v > MAXSHORT) + return MAXSHORT; + if (v < MINSHORT) + return MINSHORT; + return v; +} + +static inline bool +box_inplace(PixmapPtr pixmap, const BoxRec *box) +{ + struct sna *sna = to_sna_from_pixmap(pixmap); + return ((int)(box->x2 - box->x1) * (int)(box->y2 - box->y1) * pixmap->drawable.bitsPerPixel >> 12) >= sna->kgem.half_cpu_cache_pages; +} + +static inline bool +region_subsumes_drawable(RegionPtr region, DrawablePtr drawable) +{ + const BoxRec *extents; + + if (region->data) + return false; + + extents = RegionExtents(region); + return extents->x1 <= 0 && extents->y1 <= 0 && + extents->x2 >= drawable->width && + extents->y2 >= drawable->height; +} + +#if 0 +static inline bool +region_subsumes_damage(const RegionRec *region, struct sna_damage *damage) +{ + const BoxRec *re, *de; + + DBG(("%s?\n", __FUNCTION__)); + assert(damage); + + re = 
®ion->extents; + de = &DAMAGE_PTR(damage)->extents; + DBG(("%s: region (%d, %d), (%d, %d), damage (%d, %d), (%d, %d)\n", + __FUNCTION__, + re->x1, re->y1, re->x2, re->y2, + de->x1, de->y1, de->x2, de->y2)); + + if (re->x2 < de->x2 || re->x1 > de->x1 || + re->y2 < de->y2 || re->y1 > de->y1) { + DBG(("%s: not contained\n", __FUNCTION__)); + return false; + } + + if (region->data == NULL) { + DBG(("%s: singular region contains damage\n", __FUNCTION__)); + return true; + } + + return pixman_region_contains_rectangle((RegionPtr)region, + (BoxPtr)de) == PIXMAN_REGION_IN; +} +#endif + + +static inline bool +sna_drawable_is_clear(DrawablePtr d) +{ + struct sna_pixmap *priv = sna_pixmap(get_drawable_pixmap(d)); + return priv && priv->clear && priv->clear_color == 0; +} + +static inline struct kgem_bo *sna_pixmap_get_bo(PixmapPtr pixmap) +{ + return sna_pixmap(pixmap)->gpu_bo; +} + +static inline struct kgem_bo *sna_pixmap_pin(PixmapPtr pixmap, unsigned flags) +{ + struct sna_pixmap *priv; + + priv = sna_pixmap_force_to_gpu(pixmap, MOVE_READ | MOVE_WRITE); + if (!priv) + return NULL; + + priv->pinned |= flags; + return priv->gpu_bo; +} + + +static inline bool +_sna_transform_point(const PictTransform *transform, + int64_t x, int64_t y, int64_t result[3]) +{ + int j; + + for (j = 0; j < 3; j++) + result[j] = (transform->matrix[j][0] * x + + transform->matrix[j][1] * y + + transform->matrix[j][2]); + + return result[2] != 0; +} + +static inline void +_sna_get_transformed_coordinates(int x, int y, + const PictTransform *transform, + float *x_out, float *y_out) +{ + + int64_t result[3]; + + _sna_transform_point(transform, x, y, result); + *x_out = result[0] / (double)result[2]; + *y_out = result[1] / (double)result[2]; +} + +void +sna_get_transformed_coordinates(int x, int y, + const PictTransform *transform, + float *x_out, float *y_out); + +void +sna_get_transformed_coordinates_3d(int x, int y, + const PictTransform *transform, + float *x_out, float *y_out, float *z_out); + 
+bool sna_transform_is_affine(const PictTransform *t); +bool sna_transform_is_integer_translation(const PictTransform *t, + int16_t *tx, int16_t *ty); +bool sna_transform_is_translation(const PictTransform *t, + pixman_fixed_t *tx, pixman_fixed_t *ty); + +static inline bool +sna_transform_equal(const PictTransform *a, const PictTransform *b) +{ + if (a == b) + return true; + + if (a == NULL || b == NULL) + return false; + + return memcmp(a, b, sizeof(*a)) == 0; +} + +static inline bool +sna_picture_alphamap_equal(PicturePtr a, PicturePtr b) +{ + if (a->alphaMap != b->alphaMap) + return false; + + if (a->alphaMap) + return false; + + return (a->alphaOrigin.x == b->alphaOrigin.x && + a->alphaOrigin.y == b->alphaOrigin.y); +} + +static inline bool wedged(struct sna *sna) +{ + return unlikely(sna->kgem.wedged); +} + +static inline bool can_render(struct sna *sna) +{ + return likely(!sna->kgem.wedged && sna->have_render); +} + +static inline uint32_t pixmap_size(PixmapPtr pixmap) +{ + return (pixmap->drawable.height - 1) * pixmap->devKind + + pixmap->drawable.width * pixmap->drawable.bitsPerPixel/8; +} + +bool sna_accel_init(ScreenPtr sreen, struct sna *sna); +void sna_accel_create(struct sna *sna); +void sna_accel_block_handler(struct sna *sna, struct timeval **tv); +void sna_accel_wakeup_handler(struct sna *sna); +void sna_accel_watch_flush(struct sna *sna, int enable); +void sna_accel_close(struct sna *sna); +void sna_accel_free(struct sna *sna); + +void sna_copy_fbcon(struct sna *sna); + +bool sna_composite_create(struct sna *sna); +void sna_composite_close(struct sna *sna); + +void sna_composite(CARD8 op, + PicturePtr src, + PicturePtr mask, + PicturePtr dst, + INT16 src_x, INT16 src_y, + INT16 mask_x, INT16 mask_y, + INT16 dst_x, INT16 dst_y, + CARD16 width, CARD16 height); +void sna_composite_rectangles(CARD8 op, + PicturePtr dst, + xRenderColor *color, + int num_rects, + xRectangle *rects); +void sna_composite_trapezoids(CARD8 op, + PicturePtr src, + PicturePtr 
dst, + PictFormatPtr maskFormat, + INT16 xSrc, INT16 ySrc, + int ntrap, xTrapezoid *traps); +void sna_add_traps(PicturePtr picture, INT16 x, INT16 y, int n, xTrap *t); + +void sna_composite_triangles(CARD8 op, + PicturePtr src, + PicturePtr dst, + PictFormatPtr maskFormat, + INT16 xSrc, INT16 ySrc, + int ntri, xTriangle *tri); + +void sna_composite_tristrip(CARD8 op, + PicturePtr src, + PicturePtr dst, + PictFormatPtr maskFormat, + INT16 xSrc, INT16 ySrc, + int npoints, xPointFixed *points); + +void sna_composite_trifan(CARD8 op, + PicturePtr src, + PicturePtr dst, + PictFormatPtr maskFormat, + INT16 xSrc, INT16 ySrc, + int npoints, xPointFixed *points); + +bool sna_gradients_create(struct sna *sna); +void sna_gradients_close(struct sna *sna); + +bool sna_glyphs_create(struct sna *sna); +void sna_glyphs(CARD8 op, + PicturePtr src, + PicturePtr dst, + PictFormatPtr mask, + INT16 xSrc, INT16 ySrc, + int nlist, + GlyphListPtr list, + GlyphPtr *glyphs); +void sna_glyphs__shared(CARD8 op, + PicturePtr src, + PicturePtr dst, + PictFormatPtr mask, + INT16 src_x, INT16 src_y, + int nlist, GlyphListPtr list, GlyphPtr *glyphs); +void sna_glyph_unrealize(ScreenPtr screen, GlyphPtr glyph); +void sna_glyphs_close(struct sna *sna); + +void sna_read_boxes(struct sna *sna, + struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy, + PixmapPtr dst, int16_t dst_dx, int16_t dst_dy, + const BoxRec *box, int n); +bool sna_write_boxes(struct sna *sna, PixmapPtr dst, + struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy, + const void *src, int stride, int16_t src_dx, int16_t src_dy, + const BoxRec *box, int n); +void sna_write_boxes__xor(struct sna *sna, PixmapPtr dst, + struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy, + const void *src, int stride, int16_t src_dx, int16_t src_dy, + const BoxRec *box, int nbox, + uint32_t and, uint32_t or); + +bool sna_replace(struct sna *sna, + PixmapPtr pixmap, + struct kgem_bo **bo, + const void *src, int stride); +struct kgem_bo 
*sna_replace__xor(struct sna *sna, + PixmapPtr pixmap, + struct kgem_bo *bo, + const void *src, int stride, + uint32_t and, uint32_t or); + +bool +sna_compute_composite_extents(BoxPtr extents, + PicturePtr src, PicturePtr mask, PicturePtr dst, + INT16 src_x, INT16 src_y, + INT16 mask_x, INT16 mask_y, + INT16 dst_x, INT16 dst_y, + CARD16 width, CARD16 height); +bool +sna_compute_composite_region(RegionPtr region, + PicturePtr src, PicturePtr mask, PicturePtr dst, + INT16 src_x, INT16 src_y, + INT16 mask_x, INT16 mask_y, + INT16 dst_x, INT16 dst_y, + CARD16 width, CARD16 height); + +void +memcpy_blt(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height); +void +memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height); +void +memmove_box(const void *src, void *dst, + int bpp, int32_t stride, + const BoxRec *box, + int dx, int dy); + +void +memcpy_xor(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height, + uint32_t and, uint32_t or); + +#define SNA_CREATE_FB 0x10 +#define SNA_CREATE_SCRATCH 0x11 + +inline static bool is_power_of_two(unsigned x) +{ + return (x & (x-1)) == 0; +} + +inline static bool is_clipped(const RegionRec *r, + const DrawableRec *d) +{ + return (r->data || + r->extents.x2 - r->extents.x1 != d->width || + r->extents.y2 - r->extents.y1 != d->height); +} + +#endif /* _SNA_H */ diff --git a/cogl/driver/drm/sna_reg.h b/cogl/driver/drm/sna_reg.h new file mode 100644 index 00000000..26282361 --- /dev/null +++ b/cogl/driver/drm/sna_reg.h @@ -0,0 +1,82 @@ +#ifndef SNA_REG_H +#define SNA_REG_H + +/* Flush */ +#define MI_FLUSH (0x04<<23) +#define MI_FLUSH_DW 
(0x26<<23) + +#define MI_WRITE_DIRTY_STATE (1<<4) +#define MI_END_SCENE (1<<3) +#define MI_GLOBAL_SNAPSHOT_COUNT_RESET (1<<3) +#define MI_INHIBIT_RENDER_CACHE_FLUSH (1<<2) +#define MI_STATE_INSTRUCTION_CACHE_FLUSH (1<<1) +#define MI_INVALIDATE_MAP_CACHE (1<<0) +/* broadwater flush bits */ +#define BRW_MI_GLOBAL_SNAPSHOT_RESET (1 << 3) + +#define MI_BATCH_BUFFER_END (0xA << 23) + +/* Noop */ +#define MI_NOOP 0x00 +#define MI_NOOP_WRITE_ID (1<<22) +#define MI_NOOP_ID_MASK (1<<22 - 1) + +/* Wait for Events */ +#define MI_WAIT_FOR_EVENT (0x03<<23) +#define MI_WAIT_FOR_PIPEB_SVBLANK (1<<18) +#define MI_WAIT_FOR_PIPEA_SVBLANK (1<<17) +#define MI_WAIT_FOR_OVERLAY_FLIP (1<<16) +#define MI_WAIT_FOR_PIPEB_VBLANK (1<<7) +#define MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW (1<<5) +#define MI_WAIT_FOR_PIPEA_VBLANK (1<<3) +#define MI_WAIT_FOR_PIPEA_SCAN_LINE_WINDOW (1<<1) + +/* Set the scan line for MI_WAIT_FOR_PIPE?_SCAN_LINE_WINDOW */ +#define MI_LOAD_SCAN_LINES_INCL (0x12<<23) +#define MI_LOAD_SCAN_LINES_DISPLAY_PIPEA (0) +#define MI_LOAD_SCAN_LINES_DISPLAY_PIPEB (0x1<<20) + +/* BLT commands */ +#define BLT_WRITE_ALPHA (1<<21) +#define BLT_WRITE_RGB (1<<20) +#define BLT_SRC_TILED (1<<15) +#define BLT_DST_TILED (1<<11) + +#define COLOR_BLT_CMD ((2<<29)|(0x40<<22)|(0x3)) +#define XY_COLOR_BLT ((2<<29)|(0x50<<22)|(0x4)) +#define XY_SETUP_BLT ((2<<29)|(1<<22)|6) +#define XY_SETUP_MONO_PATTERN_SL_BLT ((2<<29)|(0x11<<22)|7) +#define XY_SETUP_CLIP ((2<<29)|(3<<22)|1) +#define XY_SCANLINE_BLT ((2<<29)|(0x25<<22)|1) +#define XY_TEXT_IMMEDIATE_BLT ((2<<29)|(0x31<<22)|(1<<16)) +#define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) +#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|0x4) +#define XY_PAT_BLT ((2<<29)|(0x51<<22)|0x4) +#define XY_PAT_BLT_IMMEDIATE ((2<<29)|(0x72<<22)) +#define XY_MONO_PAT ((0x2<<29)|(0x52<<22)|0x7) +#define XY_MONO_SRC_COPY ((0x2<<29)|(0x54<<22)|(0x6)) +#define XY_MONO_SRC_COPY_IMM ((0x2<<29)|(0x71<<22)) +#define XY_FULL_MONO_PATTERN_BLT ((0x2<<29)|(0x57<<22)|0xa) +#define 
XY_FULL_MONO_PATTERN_MONO_SRC_BLT ((0x2<<29)|(0x58<<22)|0xa) + +/* FLUSH commands */ +#define BRW_3D(Pipeline,Opcode,Subopcode) \ + ((3 << 29) | \ + ((Pipeline) << 27) | \ + ((Opcode) << 24) | \ + ((Subopcode) << 16)) +#define PIPE_CONTROL BRW_3D(3, 2, 0) +#define PIPE_CONTROL_NOWRITE (0 << 14) +#define PIPE_CONTROL_WRITE_QWORD (1 << 14) +#define PIPE_CONTROL_WRITE_DEPTH (2 << 14) +#define PIPE_CONTROL_WRITE_TIME (3 << 14) +#define PIPE_CONTROL_DEPTH_STALL (1 << 13) +#define PIPE_CONTROL_WC_FLUSH (1 << 12) +#define PIPE_CONTROL_IS_FLUSH (1 << 11) +#define PIPE_CONTROL_TC_FLUSH (1 << 10) +#define PIPE_CONTROL_NOTIFY_ENABLE (1 << 8) +#define PIPE_CONTROL_GLOBAL_GTT (1 << 2) +#define PIPE_CONTROL_LOCAL_PGTT (0 << 2) +#define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1 << 0) + +#endif diff --git a/cogl/driver/drm/sna_render.h b/cogl/driver/drm/sna_render.h new file mode 100644 index 00000000..03a70057 --- /dev/null +++ b/cogl/driver/drm/sna_render.h @@ -0,0 +1,720 @@ +#ifndef SNA_RENDER_H +#define SNA_RENDER_H + +#include "compiler.h" + +#include + +#define GRADIENT_CACHE_SIZE 16 + +#define GXinvalid 0xff + +struct sna; +struct sna_glyph; +struct sna_video; +struct sna_video_frame; +struct brw_compile; + +struct sna_composite_rectangles { + struct sna_coordinate { + int16_t x, y; + } src, mask, dst; + int16_t width, height; +}; + +struct sna_composite_op { + fastcall void (*blt)(struct sna *sna, const struct sna_composite_op *op, + const struct sna_composite_rectangles *r); + fastcall void (*box)(struct sna *sna, + const struct sna_composite_op *op, + const BoxRec *box); + void (*boxes)(struct sna *sna, const struct sna_composite_op *op, + const BoxRec *box, int nbox); + void (*done)(struct sna *sna, const struct sna_composite_op *op); + + struct sna_damage **damage; + + uint32_t op; + + struct { + PixmapPtr pixmap; + CARD32 format; + struct kgem_bo *bo; + int16_t x, y; + uint16_t width, height; + } dst; + + struct sna_composite_channel { + struct kgem_bo *bo; + PictTransform 
*transform; + uint16_t width; + uint16_t height; + uint32_t pict_format; + uint32_t card_format; + uint32_t filter; + uint32_t repeat; + uint32_t is_affine : 1; + uint32_t is_solid : 1; + uint32_t is_linear : 1; + uint32_t is_opaque : 1; + uint32_t alpha_fixup : 1; + uint32_t rb_reversed : 1; + int16_t offset[2]; + float scale[2]; + + pixman_transform_t embedded_transform; + + union { + struct { + uint32_t pixel; + float linear_dx; + float linear_dy; + float linear_offset; + } gen2; + struct gen3_shader_channel { + int type; + uint32_t mode; + uint32_t constants; + } gen3; + } u; + } src, mask; + uint32_t is_affine : 1; + uint32_t has_component_alpha : 1; + uint32_t need_magic_ca_pass : 1; + uint32_t rb_reversed : 1; + + int16_t floats_per_vertex; + int16_t floats_per_rect; + fastcall void (*prim_emit)(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r); + + struct sna_composite_redirect { + struct kgem_bo *real_bo; + struct sna_damage **real_damage, *damage; + BoxRec box; + } redirect; + + union { + struct sna_blt_state { + PixmapPtr src_pixmap; + int16_t sx, sy; + + uint32_t inplace :1; + uint32_t overwrites:1; + uint32_t bpp : 6; + + uint32_t cmd; + uint32_t br13; + uint32_t pitch[2]; + uint32_t pixel; + struct kgem_bo *bo[2]; + } blt; + + struct { + float constants[8]; + uint32_t num_constants; + } gen3; + + struct { + int wm_kernel; + int ve_id; + } gen4; + + struct { + int wm_kernel; + int ve_id; + } gen5; + + struct { + uint32_t flags; + } gen6; + + struct { + uint32_t flags; + } gen7; + } u; + + void *priv; +}; + +struct sna_composite_spans_op { + struct sna_composite_op base; + + fastcall void (*box)(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity); + void (*boxes)(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, int nbox, + float opacity); + fastcall void (*done)(struct sna *sna, + const struct sna_composite_spans_op *op); + + 
fastcall void (*prim_emit)(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity); +}; + +struct sna_fill_op { + struct sna_composite_op base; + + void (*blt)(struct sna *sna, const struct sna_fill_op *op, + int16_t x, int16_t y, int16_t w, int16_t h); + fastcall void (*box)(struct sna *sna, + const struct sna_fill_op *op, + const BoxRec *box); + fastcall void (*boxes)(struct sna *sna, + const struct sna_fill_op *op, + const BoxRec *box, + int count); + void (*done)(struct sna *sna, const struct sna_fill_op *op); +}; + +struct sna_copy_op { + struct sna_composite_op base; + + void (*blt)(struct sna *sna, const struct sna_copy_op *op, + int16_t sx, int16_t sy, + int16_t w, int16_t h, + int16_t dx, int16_t dy); + void (*done)(struct sna *sna, const struct sna_copy_op *op); +}; + +struct sna_render { + int max_3d_size; + int max_3d_pitch; + + bool (*composite)(struct sna *sna, uint8_t op, + PicturePtr dst, PicturePtr src, PicturePtr mask, + int16_t src_x, int16_t src_y, + int16_t msk_x, int16_t msk_y, + int16_t dst_x, int16_t dst_y, + int16_t w, int16_t h, + struct sna_composite_op *tmp); + + bool (*check_composite_spans)(struct sna *sna, uint8_t op, + PicturePtr dst, PicturePtr src, + int16_t w, int16_t h, unsigned flags); + bool (*composite_spans)(struct sna *sna, uint8_t op, + PicturePtr dst, PicturePtr src, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + int16_t w, int16_t h, + unsigned flags, + struct sna_composite_spans_op *tmp); +#define COMPOSITE_SPANS_RECTILINEAR 0x1 +#define COMPOSITE_SPANS_INPLACE_HINT 0x2 + + bool (*video)(struct sna *sna, + struct sna_video *video, + struct sna_video_frame *frame, + RegionPtr dstRegion, + short src_w, short src_h, + short drw_w, short drw_h, + PixmapPtr pixmap); + + bool (*fill_boxes)(struct sna *sna, + CARD8 op, + PictFormat format, + const xRenderColor *color, + PixmapPtr dst, struct kgem_bo *dst_bo, + const BoxRec *box, int n); + bool (*fill)(struct sna *sna, 
uint8_t alu, + PixmapPtr dst, struct kgem_bo *dst_bo, + uint32_t color, + struct sna_fill_op *tmp); + bool (*fill_one)(struct sna *sna, PixmapPtr dst, struct kgem_bo *dst_bo, + uint32_t color, + int16_t x1, int16_t y1, int16_t x2, int16_t y2, + uint8_t alu); + bool (*clear)(struct sna *sna, PixmapPtr dst, struct kgem_bo *dst_bo); + + bool (*copy_boxes)(struct sna *sna, uint8_t alu, + PixmapPtr src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy, + PixmapPtr dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy, + const BoxRec *box, int n, unsigned flags); +#define COPY_LAST 0x1 + + bool (*copy)(struct sna *sna, uint8_t alu, + PixmapPtr src, struct kgem_bo *src_bo, + PixmapPtr dst, struct kgem_bo *dst_bo, + struct sna_copy_op *op); + + void (*flush)(struct sna *sna); + void (*reset)(struct sna *sna); + void (*fini)(struct sna *sna); + + struct sna_alpha_cache { + struct kgem_bo *cache_bo; + struct kgem_bo *bo[256]; + } alpha_cache; + + struct sna_solid_cache { + struct kgem_bo *cache_bo; + uint32_t color[1024]; + struct kgem_bo *bo[1024]; + int last; + int size; + int dirty; + } solid_cache; + + struct { + struct sna_gradient_cache { + struct kgem_bo *bo; + int nstops; + PictGradientStop *stops; + } cache[GRADIENT_CACHE_SIZE]; + int size; + } gradient_cache; + + struct sna_glyph_cache{ + PicturePtr picture; + struct sna_glyph **glyphs; + uint16_t count; + uint16_t evict; + } glyph[2]; + pixman_image_t *white_image; + PicturePtr white_picture; +#if HAS_PIXMAN_GLYPHS + pixman_glyph_cache_t *glyph_cache; +#endif + + uint16_t vertex_start; + uint16_t vertex_index; + uint16_t vertex_used; + uint16_t vertex_size; + uint16_t vertex_reloc[16]; + int nvertex_reloc; + + struct kgem_bo *vbo; + float *vertices; + + float vertex_data[1024]; +}; + +struct gen2_render_state { + uint32_t target; + bool need_invariant; + uint32_t logic_op_enabled; + uint32_t ls1, ls2, vft; + uint32_t diffuse; + uint32_t specular; + uint16_t vertex_offset; +}; + +struct 
gen3_render_state { + uint32_t current_dst; + bool need_invariant; + uint32_t tex_count; + uint32_t last_drawrect_limit; + uint32_t last_target; + uint32_t last_blend; + uint32_t last_constants; + uint32_t last_sampler; + uint32_t last_shader; + uint32_t last_diffuse; + uint32_t last_specular; + + uint16_t vertex_offset; + uint16_t last_vertex_offset; + uint16_t floats_per_vertex; + uint16_t last_floats_per_vertex; + + uint32_t tex_map[4]; + uint32_t tex_handle[2]; + uint32_t tex_delta[2]; +}; + +struct gen4_render_state { + struct kgem_bo *general_bo; + + uint32_t vs; + uint32_t sf[2]; + uint32_t wm; + uint32_t cc; + + int ve_id; + uint32_t drawrect_offset; + uint32_t drawrect_limit; + uint32_t vb_id; + uint32_t last_pipelined_pointers; + uint16_t vertex_offset; + uint16_t last_primitive; + int16_t floats_per_vertex; + uint16_t surface_table; + + bool needs_invariant; + bool needs_urb; +}; + +struct gen5_render_state { + struct kgem_bo *general_bo; + + uint32_t vs; + uint32_t sf[2]; + uint32_t wm; + uint32_t cc; + + int ve_id; + uint32_t drawrect_offset; + uint32_t drawrect_limit; + uint32_t vb_id; + uint16_t vertex_offset; + uint16_t last_primitive; + int16_t floats_per_vertex; + uint16_t surface_table; + uint16_t last_pipelined_pointers; + + bool needs_invariant; +}; + +enum { + GEN6_WM_KERNEL_NOMASK = 0, + GEN6_WM_KERNEL_NOMASK_P, + + GEN6_WM_KERNEL_MASK, + GEN6_WM_KERNEL_MASK_P, + + GEN6_WM_KERNEL_MASKCA, + GEN6_WM_KERNEL_MASKCA_P, + + GEN6_WM_KERNEL_MASKSA, + GEN6_WM_KERNEL_MASKSA_P, + + GEN6_WM_KERNEL_OPACITY, + GEN6_WM_KERNEL_OPACITY_P, + + GEN6_WM_KERNEL_VIDEO_PLANAR, + GEN6_WM_KERNEL_VIDEO_PACKED, + GEN6_KERNEL_COUNT +}; + +struct gen6_render_state { + const struct gt_info *info; + struct kgem_bo *general_bo; + + uint32_t vs_state; + uint32_t sf_state; + uint32_t sf_mask_state; + uint32_t wm_state; + uint32_t wm_kernel[GEN6_KERNEL_COUNT][3]; + + uint32_t cc_vp; + uint32_t cc_blend; + + uint32_t drawrect_offset; + uint32_t drawrect_limit; + uint32_t blend; 
+ uint32_t samplers; + uint32_t kernel; + + uint16_t num_sf_outputs; + uint16_t vb_id; + uint16_t ve_id; + uint16_t vertex_offset; + uint16_t last_primitive; + int16_t floats_per_vertex; + uint16_t surface_table; + + bool needs_invariant; + bool first_state_packet; +}; + +enum { + GEN7_WM_KERNEL_NOMASK = 0, + GEN7_WM_KERNEL_NOMASK_P, + + GEN7_WM_KERNEL_MASK, + GEN7_WM_KERNEL_MASK_P, + + GEN7_WM_KERNEL_MASKCA, + GEN7_WM_KERNEL_MASKCA_P, + + GEN7_WM_KERNEL_MASKSA, + GEN7_WM_KERNEL_MASKSA_P, + + GEN7_WM_KERNEL_OPACITY, + GEN7_WM_KERNEL_OPACITY_P, + + GEN7_WM_KERNEL_VIDEO_PLANAR, + GEN7_WM_KERNEL_VIDEO_PACKED, + GEN7_WM_KERNEL_COUNT +}; + +struct gen7_render_state { + const struct gt_info *info; + struct kgem_bo *general_bo; + + uint32_t vs_state; + uint32_t sf_state; + uint32_t sf_mask_state; + uint32_t wm_state; + uint32_t wm_kernel[GEN7_WM_KERNEL_COUNT][3]; + + uint32_t cc_vp; + uint32_t cc_blend; + + uint32_t drawrect_offset; + uint32_t drawrect_limit; + uint32_t blend; + uint32_t samplers; + uint32_t kernel; + + uint16_t num_sf_outputs; + uint16_t vb_id; + uint16_t ve_id; + uint16_t vertex_offset; + uint16_t last_primitive; + int16_t floats_per_vertex; + uint16_t surface_table; + + bool needs_invariant; + bool emit_flush; +}; + +struct sna_static_stream { + uint32_t size, used; + uint8_t *data; +}; + +int sna_static_stream_init(struct sna_static_stream *stream); +uint32_t sna_static_stream_add(struct sna_static_stream *stream, + const void *data, uint32_t len, uint32_t align); +void *sna_static_stream_map(struct sna_static_stream *stream, + uint32_t len, uint32_t align); +uint32_t sna_static_stream_offsetof(struct sna_static_stream *stream, + void *ptr); +unsigned sna_static_stream_compile_sf(struct sna *sna, + struct sna_static_stream *stream, + bool (*compile)(struct brw_compile *)); + +unsigned sna_static_stream_compile_wm(struct sna *sna, + struct sna_static_stream *stream, + bool (*compile)(struct brw_compile *, int), + int width); +struct kgem_bo 
*sna_static_stream_fini(struct sna *sna, + struct sna_static_stream *stream); + +struct kgem_bo * +sna_render_get_solid(struct sna *sna, + uint32_t color); + +void +sna_render_flush_solid(struct sna *sna); + +struct kgem_bo * +sna_render_get_gradient(struct sna *sna, + PictGradient *pattern); + +uint32_t sna_rgba_for_color(uint32_t color, int depth); +uint32_t sna_rgba_to_color(uint32_t rgba, uint32_t format); +bool sna_get_rgba_from_pixel(uint32_t pixel, + uint16_t *red, + uint16_t *green, + uint16_t *blue, + uint16_t *alpha, + uint32_t format); +bool sna_picture_is_solid(PicturePtr picture, uint32_t *color); + +void no_render_init(struct sna *sna); + +bool gen2_render_init(struct sna *sna); +bool gen3_render_init(struct sna *sna); +bool gen4_render_init(struct sna *sna); +bool gen5_render_init(struct sna *sna); +bool gen6_render_init(struct sna *sna); +bool gen7_render_init(struct sna *sna); + +bool sna_tiling_composite(uint32_t op, + PicturePtr src, + PicturePtr mask, + PicturePtr dst, + int16_t src_x, int16_t src_y, + int16_t mask_x, int16_t mask_y, + int16_t dst_x, int16_t dst_y, + int16_t width, int16_t height, + struct sna_composite_op *tmp); +bool sna_tiling_composite_spans(uint32_t op, + PicturePtr src, + PicturePtr dst, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + int16_t width, int16_t height, + unsigned flags, + struct sna_composite_spans_op *tmp); +bool sna_tiling_fill_boxes(struct sna *sna, + CARD8 op, + PictFormat format, + const xRenderColor *color, + PixmapPtr dst, struct kgem_bo *dst_bo, + const BoxRec *box, int n); + +bool sna_tiling_copy_boxes(struct sna *sna, uint8_t alu, + PixmapPtr src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy, + PixmapPtr dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy, + const BoxRec *box, int n); + +bool sna_tiling_blt_copy_boxes(struct sna *sna, uint8_t alu, + struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy, + struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy, + 
int bpp, const BoxRec *box, int nbox); + +bool sna_blt_composite(struct sna *sna, + uint32_t op, + PicturePtr src, + PicturePtr dst, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + int16_t width, int16_t height, + struct sna_composite_op *tmp, + bool fallback); +bool sna_blt_composite__convert(struct sna *sna, + int x, int y, + int width, int height, + struct sna_composite_op *tmp); + +bool sna_blt_fill(struct sna *sna, uint8_t alu, + struct kgem_bo *bo, + int bpp, + uint32_t pixel, + struct sna_fill_op *fill); + +bool sna_blt_copy(struct sna *sna, uint8_t alu, + struct kgem_bo *src, + struct kgem_bo *dst, + int bpp, + struct sna_copy_op *copy); + +bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu, + struct kgem_bo *bo, + int bpp, + uint32_t pixel, + const BoxRec *box, int n); + +bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu, + struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy, + struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy, + int bpp, + const BoxRec *box, int n); +bool sna_blt_copy_boxes_fallback(struct sna *sna, uint8_t alu, + PixmapPtr src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy, + PixmapPtr dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy, + const BoxRec *box, int nbox); + +bool _sna_get_pixel_from_rgba(uint32_t *pixel, + uint16_t red, + uint16_t green, + uint16_t blue, + uint16_t alpha, + uint32_t format); + +static inline bool +sna_get_pixel_from_rgba(uint32_t * pixel, + uint16_t red, + uint16_t green, + uint16_t blue, + uint16_t alpha, + uint32_t format) +{ + switch (format) { + case PICT_x8r8g8b8: + alpha = 0xffff; + /* fall through to re-use a8r8g8b8 expansion */ + case PICT_a8r8g8b8: + *pixel = ((alpha >> 8 << 24) | + (red >> 8 << 16) | + (green & 0xff00) | + (blue >> 8)); + return TRUE; + case PICT_a8: + *pixel = alpha >> 8; + return TRUE; + } + + return _sna_get_pixel_from_rgba(pixel, red, green, blue, alpha, format); +} + +struct kgem_bo * +__sna_render_pixmap_bo(struct sna *sna, + 
PixmapPtr pixmap, + const BoxRec *box, + bool blt); + +int +sna_render_pixmap_bo(struct sna *sna, + struct sna_composite_channel *channel, + PixmapPtr pixmap, + int16_t x, int16_t y, + int16_t w, int16_t h, + int16_t dst_x, int16_t dst_y); + +bool +sna_render_pixmap_partial(struct sna *sna, + PixmapPtr pixmap, + struct kgem_bo *bo, + struct sna_composite_channel *channel, + int16_t x, int16_t y, + int16_t w, int16_t h); + +int +sna_render_picture_extract(struct sna *sna, + PicturePtr picture, + struct sna_composite_channel *channel, + int16_t x, int16_t y, + int16_t w, int16_t h, + int16_t dst_x, int16_t dst_y); + +int +sna_render_picture_approximate_gradient(struct sna *sna, + PicturePtr picture, + struct sna_composite_channel *channel, + int16_t x, int16_t y, + int16_t w, int16_t h, + int16_t dst_x, int16_t dst_y); + +int +sna_render_picture_fixup(struct sna *sna, + PicturePtr picture, + struct sna_composite_channel *channel, + int16_t x, int16_t y, + int16_t w, int16_t h, + int16_t dst_x, int16_t dst_y); + +int +sna_render_picture_convert(struct sna *sna, + PicturePtr picture, + struct sna_composite_channel *channel, + PixmapPtr pixmap, + int16_t x, int16_t y, + int16_t w, int16_t h, + int16_t dst_x, int16_t dst_y); + +inline static void sna_render_composite_redirect_init(struct sna_composite_op *op) +{ + struct sna_composite_redirect *t = &op->redirect; + t->real_bo = NULL; + t->damage = NULL; +} + +bool +sna_render_composite_redirect(struct sna *sna, + struct sna_composite_op *op, + int x, int y, int width, int height); + +void +sna_render_composite_redirect_done(struct sna *sna, + const struct sna_composite_op *op); + +bool +sna_render_copy_boxes__overlap(struct sna *sna, uint8_t alu, + PixmapPtr src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy, + PixmapPtr dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy, + const BoxRec *box, int n, const BoxRec *extents); + +bool +sna_composite_mask_is_opaque(PicturePtr mask); + +#endif /* SNA_RENDER_H 
*/ diff --git a/cogl/winsys/cogl-winsys-drm-private.h b/cogl/winsys/cogl-winsys-drm-private.h new file mode 100644 index 00000000..b3aaeac2 --- /dev/null +++ b/cogl/winsys/cogl-winsys-drm-private.h @@ -0,0 +1,30 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2011 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + * + * + */ + +#ifndef _COGL_WINSYS_DRM_PRIVATE_H_ +#define _COGL_WINSYS_DRM_PRIVATE_H_ + +const CoglWinsysVtable * +_cogl_winsys_drm_get_vtable (void); + +#endif /* _COGL_WINSYS_DRM_PRIVATE_H_ */ diff --git a/cogl/winsys/cogl-winsys-drm.c b/cogl/winsys/cogl-winsys-drm.c new file mode 100644 index 00000000..673557fd --- /dev/null +++ b/cogl/winsys/cogl-winsys-drm.c @@ -0,0 +1,358 @@ +/* + * Cogl + * + * An object oriented GL/GLES Abstraction/Utility Layer + * + * Copyright (C) 2012 Intel Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * <http://www.gnu.org/licenses/>. + * + * + * Authors: + * Robert Bragg + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "cogl-renderer-private.h" +#include "cogl-display-private.h" +#include "cogl-context-private.h" +#include "cogl-framebuffer-private.h" +#include "cogl-private.h" +#include "cogl-winsys-drm-private.h" +#include "cogl-error-private.h" + +#include +#include +#include +#include +#include +#include + +#define LIBUDEV_I_KNOW_THE_API_IS_SUBJECT_TO_CHANGE +#include <libudev.h> + +static int _cogl_winsys_drm_dummy_ptr; + +typedef struct _CoglRendererDRM +{ + dev_t devnum; + int vendor_id; + int chip_id; + int fd; +} CoglRendererDRM; + +typedef struct _CoglDisplayDRM +{ + int padding; +} CoglDisplayDRM; + +/* This provides a NOP winsys. This can be useful for debugging or for + * integrating with toolkits that already have window system + * integration code. 
+ */ + +static CoglFuncPtr +_cogl_winsys_renderer_get_proc_address (CoglRenderer *renderer, + const char *name, + CoglBool in_core) +{ + static GModule *module = NULL; + + /* this should find the right function if the program is linked against a + * library providing it */ + if (G_UNLIKELY (module == NULL)) + module = g_module_open (NULL, 0); + + if (module) + { + void *symbol; + + if (g_module_symbol (module, name, &symbol)) + return symbol; + } + + return NULL; +} + +static void +_cogl_winsys_renderer_disconnect (CoglRenderer *renderer) +{ + CoglRendererDRM *drm_renderer = renderer->winsys; + + close (drm_renderer->fd); + + g_slice_free (CoglRendererDRM, drm_renderer); + + renderer->winsys = NULL; +} + +static const char * +get_udev_property (struct udev_device *device, + const char *name) +{ + struct udev_list_entry *entry; + + udev_list_entry_foreach (entry, + udev_device_get_properties_list_entry (device)) + { + if (strcmp (udev_list_entry_get_name (entry), name) == 0) + return udev_list_entry_get_value (entry); + } + + return NULL; +} + +static char * +match_device (struct udev_device *device, + dev_t *devnum, + uint32_t *vendor_id, + uint32_t *chip_id) +{ + static const struct _Device { + uint32_t vendor_id; + uint32_t chip_id; + } devices[] = { + { 0x8086, 0x29a2 }, /* I965 G */ + { 0x8086, 0x2982 }, /* G35 G */ + { 0x8086, 0x2992 }, /* I965 Q */ + { 0x8086, 0x2972 }, /* I946 GZ */ + { 0x8086, 0x2a02 }, /* I965 GM */ + { 0x8086, 0x2a12 }, /* I965 GME */ + { 0x8086, 0x2e02 }, /* IGD E G */ + { 0x8086, 0x2e22 }, /* G45 G */ + { 0x8086, 0x2e12 }, /* Q45 G */ + { 0x8086, 0x2e32 }, /* G41 G */ + { 0x8086, 0x2a42 }, /* GM45 GM */ + + { 0x8086, 0x2582 }, /* I915 G */ + { 0x8086, 0x2592 }, /* I915 GM */ + { 0x8086, 0x258a }, /* E7221 G */ + { 0x8086, 0x2772 }, /* I945 G */ + { 0x8086, 0x27a2 }, /* I945 GM */ + { 0x8086, 0x27ae }, /* I945 GME */ + { 0x8086, 0x29c2 }, /* G33 G */ + { 0x8086, 0x29b2 }, /* Q35 G */ + { 0x8086, 0x29d2 }, /* Q33 G */ + { 0x8086, 0xa011 
}, /* IGD GM */ + { 0x8086, 0xa001 }, /* IGD G */ + + /* XXX i830 */ + + { 0x8086, ~0 }, /* intel */ + }; + + struct udev_device *parent; + const char *pci_id; + const char *path; + int i; + + *devnum = udev_device_get_devnum (device); + + parent = udev_device_get_parent (device); + pci_id = get_udev_property (parent, "PCI_ID"); + if (pci_id == NULL || sscanf (pci_id, "%x:%x", vendor_id, chip_id) != 2) + return NULL; + + for (i = 0; i < G_N_ELEMENTS (devices); i++) + { + if (devices[i].vendor_id == *vendor_id && + (devices[i].chip_id == ~0U || devices[i].chip_id == *chip_id)) + break; + } + + if (i == G_N_ELEMENTS (devices)) + return NULL; + + path = udev_device_get_devnode (device); + if (path == NULL) + path = "/dev/dri/card0"; /* XXX buggy udev? */ + + return g_strdup (path); +} + +static CoglBool +_cogl_winsys_renderer_connect (CoglRenderer *renderer, + CoglError **error) +{ + struct udev *udev; + struct udev_enumerate *e; + struct udev_list_entry *entry; + dev_t devnum; + uint32_t vendor_id; + uint32_t chip_id; + int fd = -1; + CoglRendererDRM *drm_renderer; + + udev = udev_new (); + if (udev == NULL) + { + _cogl_set_error (error, + COGL_WINSYS_ERROR, + COGL_WINSYS_ERROR_INIT, + "Failed to init udev api"); + return FALSE; + } + + e = udev_enumerate_new (udev); + udev_enumerate_add_match_subsystem (e, "drm"); + udev_enumerate_scan_devices (e); + udev_list_entry_foreach (entry, udev_enumerate_get_list_entry (e)) + { + struct udev_device *device = + udev_device_new_from_syspath (udev, udev_list_entry_get_name (entry)); + char *path = match_device (device, &devnum, &vendor_id, &chip_id); + + if (path) + { + g_print ("Matched device: %s\n", path); + fd = open (path, O_RDWR); + if (fd == -1) + { + g_warning ("Failed to open device node %s: %m", path); + continue; + } + + break; + } + + //g_print ("device %s\n", udev_list_entry_get_name (entry)); + udev_device_unref (device); + } + + udev_enumerate_unref (e); + udev_unref (udev); + + if (fd == -1) + return FALSE; + + 
drm_renderer = g_slice_new0 (CoglRendererDRM); + drm_renderer->devnum = devnum; + drm_renderer->vendor_id = vendor_id; + drm_renderer->chip_id = chip_id; + drm_renderer->fd = fd; + + renderer->winsys = drm_renderer; + + return TRUE; +} + +static void +_cogl_winsys_display_destroy (CoglDisplay *display) +{ + display->winsys = NULL; +} + +static CoglBool +_cogl_winsys_display_setup (CoglDisplay *display, + CoglError **error) +{ + display->winsys = &_cogl_winsys_drm_dummy_ptr; + return TRUE; +} + +static CoglBool +_cogl_winsys_context_init (CoglContext *context, CoglError **error) +{ + context->winsys = &_cogl_winsys_drm_dummy_ptr; + + if (!_cogl_context_update_features (context, error)) + return FALSE; + + memset (context->winsys_features, 0, sizeof (context->winsys_features)); + + return TRUE; +} + +static void +_cogl_winsys_context_deinit (CoglContext *context) +{ + context->winsys = NULL; +} + +static CoglBool +_cogl_winsys_onscreen_init (CoglOnscreen *onscreen, + CoglError **error) +{ + return TRUE; +} + +static void +_cogl_winsys_onscreen_deinit (CoglOnscreen *onscreen) +{ +} + +static void +_cogl_winsys_onscreen_bind (CoglOnscreen *onscreen) +{ +} + +static void +_cogl_winsys_onscreen_swap_buffers (CoglOnscreen *onscreen) +{ +} + +static void +_cogl_winsys_onscreen_update_swap_throttled (CoglOnscreen *onscreen) +{ +} + +static void +_cogl_winsys_onscreen_set_visibility (CoglOnscreen *onscreen, + CoglBool visibility) +{ +} + +const CoglWinsysVtable * +_cogl_winsys_drm_get_vtable (void) +{ + static CoglBool vtable_inited = FALSE; + static CoglWinsysVtable vtable; + + /* It would be nice if we could use C99 struct initializers here + like the GLX backend does. However this code is more likely to be + compiled using Visual Studio which (still!) 
doesn't support them + so we initialize it in code instead */ + + if (!vtable_inited) + { + memset (&vtable, 0, sizeof (vtable)); + + vtable.id = COGL_WINSYS_ID_DRM; + vtable.name = "DRM"; + vtable.renderer_get_proc_address = _cogl_winsys_renderer_get_proc_address; + vtable.renderer_connect = _cogl_winsys_renderer_connect; + vtable.renderer_disconnect = _cogl_winsys_renderer_disconnect; + vtable.display_setup = _cogl_winsys_display_setup; + vtable.display_destroy = _cogl_winsys_display_destroy; + vtable.context_init = _cogl_winsys_context_init; + vtable.context_deinit = _cogl_winsys_context_deinit; + + vtable.onscreen_init = _cogl_winsys_onscreen_init; + vtable.onscreen_deinit = _cogl_winsys_onscreen_deinit; + vtable.onscreen_bind = _cogl_winsys_onscreen_bind; + vtable.onscreen_swap_buffers = _cogl_winsys_onscreen_swap_buffers; + vtable.onscreen_update_swap_throttled = + _cogl_winsys_onscreen_update_swap_throttled; + vtable.onscreen_set_visibility = _cogl_winsys_onscreen_set_visibility; + + vtable_inited = TRUE; + } + + return &vtable; +} diff --git a/configure.ac b/configure.ac index 59f5ce78..47de343c 100644 --- a/configure.ac +++ b/configure.ac @@ -842,6 +842,24 @@ AM_CONDITIONAL(SUPPORT_SDL2, [test "x$SUPPORT_SDL2" = "xyes"]) AS_IF([test "x$SUPPORT_SDL2" = "xyes" -a "x$SUPPORT_SDL" = "xyes"], [AC_MSG_ERROR([The SDL1 and SDL2 winsyses are currently mutually exclusive])]) +AC_ARG_ENABLE( + [drm], + [AC_HELP_STRING([--enable-drm=@<:@no/yes@:>@], [Enable DRM support @<:@default=no@:>@])], + [], + [enable_drm=no]) +AS_IF([test "x$enable_drm" = "xyes"], + [ + SUPPORT_DRM=yes + GL_WINSYS_APIS="$GL_WINSYS_APIS drm" + COGL_PKG_REQUIRES="$COGL_PKG_REQUIRES libudev libdrm" + + AC_DEFINE([HAVE_COGL_DRM], 1, [Have DRM support for rendering]) + COGL_DEFINES_SYMBOLS="$COGL_DEFINES_SYMBOLS COGL_HAS_DRM_SUPPORT" + ], + [SUPPORT_DRM=no]) +AM_CONDITIONAL(SUPPORT_DRM, [test "x$SUPPORT_DRM" = "xyes"]) + + EGL_PLATFORM_COUNT=0 AC_ARG_ENABLE( diff --git a/examples/cogl-info.c 
b/examples/cogl-info.c index 3eacdc39..9afd4a59 100644 --- a/examples/cogl-info.c +++ b/examples/cogl-info.c @@ -144,7 +144,9 @@ get_winsys_name_for_id (CoglWinsysID winsys_id) case COGL_WINSYS_ID_WGL: return "EGL + Windows WGL platform"; case COGL_WINSYS_ID_SDL: - return "EGL + SDL platform"; + return "SDL"; + case COGL_WINSYS_ID_DRM: + return "DRM"; } g_return_val_if_reached ("Unknown"); } -- cgit v1.2.1