124 files changed, 10359 insertions, 46 deletions
diff --git a/Makefile.am b/Makefile.am
index e75a2db..cf57b8f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -30,7 +30,7 @@ if BUILD_I965_DRIVER
 SUBDIRS += i965_drv_video
 endif
 
-pcfiles = libva.pc
+pcfiles = libva.pc libva-tpi.pc
 pcfiles += libva-x11.pc
 if USE_GLX
 pcfiles += libva-glx.pc
@@ -39,6 +39,6 @@ endif
 pkgconfigdir = @pkgconfigdir@
 pkgconfig_DATA = $(pcfiles)
 
-EXTRA_DIST = libva.pc.in libva-x11.pc.in libva-glx.pc.in
+EXTRA_DIST = libva.pc.in libva-tpi.pc.in libva-x11.pc.in libva-glx.pc.in
 
 CLEANFILES = $(pcfiles)
diff --git a/configure.ac b/configure.ac
index 8792dcc..ac54963 100644
--- a/configure.ac
+++ b/configure.ac
@@ -29,7 +29,7 @@ m4_define([libva_version],
           [libva_major_version.libva_minor_version.libva_micro_version])
 
 # if the library source code has changed, increment revision
-m4_define([libva_lt_revision], [4])
+m4_define([libva_lt_revision], [5])
 # if any interface was added/removed/changed, then inc current, reset revision
 m4_define([libva_lt_current], [1])
 # if any interface was added since last public release, then increment age
@@ -163,6 +163,7 @@ AC_OUTPUT([
 	i965_drv_video/shaders/mpeg2/Makefile
 	i965_drv_video/shaders/mpeg2/vld/Makefile
 	i965_drv_video/shaders/render/Makefile
+	i965_drv_video/shaders/post_processing/Makefile
 	test/Makefile
 	test/basic/Makefile
 	test/decode/Makefile
@@ -171,5 +172,6 @@ AC_OUTPUT([
 	libva.pc
 	libva-x11.pc
 	libva-glx.pc
+	libva-tpi.pc
 ])
 
diff --git a/i965_drv_video/Makefile.am b/i965_drv_video/Makefile.am
index 058b525..f32d579 100644
--- a/i965_drv_video/Makefile.am
+++ b/i965_drv_video/Makefile.am
@@ -42,7 +42,8 @@ i965_drv_video_la_SOURCES =	\
 	i965_drv_video.c	\
 	i965_avc_bsd.c		\
 	i965_avc_hw_scoreboard.c\
-	i965_avc_ildb.c
+	i965_avc_ildb.c		\
+	i965_post_processing.c
 
 noinst_HEADERS =                \
 	object_heap.h           \
@@ -59,4 +60,5 @@ noinst_HEADERS =                \
 	i965_structs.h		\
 	i965_avc_bsd.h		\
 	i965_avc_hw_scoreboard.h\
-	i965_avc_ildb.h
+	i965_avc_ildb.h		\
+	i965_post_processing.h
diff --git a/i965_drv_video/i965_defines.h b/i965_drv_video/i965_defines.h
index aa2baa3..839712e 100644
--- a/i965_drv_video/i965_defines.h
+++ b/i965_drv_video/i965_defines.h
@@ -357,6 +357,29 @@
 #define SCOREBOARD_STALLING     0
 #define SCOREBOARD_NON_STALLING 1
 
+#define SURFACE_FORMAT_YCRCB_NORMAL     0       /* SURFACE_FORMAT_*: surface format codes 0..12; NOTE(review): presumably hardware encodings -- confirm against the PRM */
+#define SURFACE_FORMAT_YCRCB_SWAPUVY    1
+#define SURFACE_FORMAT_YCRCB_SWAPUV     2
+#define SURFACE_FORMAT_YCRCB_SWAPY      3
+#define SURFACE_FORMAT_PLANAR_420_8     4
+#define SURFACE_FORMAT_PLANAR_411_8     5
+#define SURFACE_FORMAT_PLANAR_422_8     6
+#define SURFACE_FORMAT_STMM_DN_STATISTICS       7
+#define SURFACE_FORMAT_R10G10B10A2_UNORM        8
+#define SURFACE_FORMAT_R8G8B8A8_UNORM   9
+#define SURFACE_FORMAT_R8B8_UNORM       10
+#define SURFACE_FORMAT_R8_UNORM         11
+#define SURFACE_FORMAT_Y8_UNORM         12
+
+#define AVS_FILTER_ADAPTIVE_8_TAP       0       /* AVS_FILTER_*: two-valued selector; presumably the AVS filter type -- TODO confirm */
+#define AVS_FILTER_NEAREST              1
+
+#define IEF_FILTER_COMBO                0       /* IEF_FILTER_*: two-valued selector; presumably the IEF filter type -- TODO confirm */
+#define IEF_FILTER_DETAIL               1
+
+#define IEF_FILTER_SIZE_3X3             0       /* IEF_FILTER_SIZE_*: two-valued selector; presumably the IEF kernel size -- TODO confirm */
+#define IEF_FILTER_SIZE_5X5             1
+
 #define URB_SIZE(intel)         (IS_IRONLAKE(intel->device_id) ? 1024 : \
                                  IS_G4X(intel->device_id) ? 384 : 256)
 #endif /* _I965_DEFINES_H_ */
diff --git a/i965_drv_video/i965_drv_video.c b/i965_drv_video/i965_drv_video.c
index 104c105..ec5412d 100644
--- a/i965_drv_video/i965_drv_video.c
+++ b/i965_drv_video/i965_drv_video.c
@@ -350,6 +350,8 @@ i965_destroy_surface(struct object_heap *heap, struct object_base *obj)
 
     dri_bo_unreference(obj_surface->bo);
     obj_surface->bo = NULL;
+    dri_bo_unreference(obj_surface->pp_out_bo);
+    obj_surface->pp_out_bo = NULL;
 
     if (obj_surface->free_private_data != NULL) {
         obj_surface->free_private_data(&obj_surface->private_data);
@@ -395,6 +397,7 @@ i965_CreateSurfaces(VADriverContextP ctx,
         obj_surface->size = SIZE_YUV420(obj_surface->width, obj_surface->height);
         obj_surface->flags = SURFACE_REFERENCED;
         obj_surface->bo = NULL;
+        obj_surface->pp_out_bo = NULL;
         obj_surface->private_data = NULL;
         obj_surface->free_private_data = NULL;
     }
@@ -1644,7 +1647,7 @@ i965_GetImage(VADriverContextP ctx,
 VAStatus 
 i965_PutSurface(VADriverContextP ctx,
                 VASurfaceID surface,
-                Drawable draw, /* X Drawable */
+                void *draw, /* X Drawable */
                 short srcx,
                 short srcy,
                 unsigned short srcw,
@@ -1667,6 +1670,7 @@ i965_PutSurface(VADriverContextP ctx,
     int ret;
     uint32_t name;
     Bool new_region = False;
+    int pp_flag = 0;
     /* Currently don't support DRI1 */
     if (dri_state->driConnectedFlag != VA_DRI2)
         return VA_STATUS_ERROR_UNKNOWN;
@@ -1678,7 +1682,7 @@ i965_PutSurface(VADriverContextP ctx,
     if (obj_surface->bo == NULL)
         return VA_STATUS_SUCCESS;
 
-    dri_drawable = dri_get_drawable(ctx, draw);
+    dri_drawable = dri_get_drawable(ctx, (Drawable)draw);
     assert(dri_drawable);
 
     buffer = dri_get_rendering_buffer(ctx, dri_drawable);
@@ -1716,9 +1720,16 @@ i965_PutSurface(VADriverContextP ctx,
         assert(ret == 0);
     }
 
+    if ((flags & VA_FILTER_SCALING_MASK) == VA_FILTER_SCALING_NL_ANAMORPHIC)
+        pp_flag |= I965_PP_FLAG_AVS;
+
+    if (flags & (VA_BOTTOM_FIELD | VA_TOP_FIELD))
+        pp_flag |= I965_PP_FLAG_DEINTERLACING;
+
     i965_render_put_surface(ctx, surface,
                             srcx, srcy, srcw, srch,
-                            destx, desty, destw, desth);
+                            destx, desty, destw, desth,
+                            pp_flag);
 
     if(obj_surface->subpic != VA_INVALID_ID) {	
 	i965_render_put_subpic(ctx, surface,
diff --git a/i965_drv_video/i965_drv_video.h b/i965_drv_video/i965_drv_video.h
index 8643bd6..7fc9cdb 100644
--- a/i965_drv_video/i965_drv_video.h
+++ b/i965_drv_video/i965_drv_video.h
@@ -109,6 +109,11 @@ struct object_surface
     int orig_height;
     int flags;
     dri_bo *bo;
+    int pp_out_width;
+    int pp_out_height;
+    int orig_pp_out_width;
+    int orig_pp_out_height;
+    dri_bo *pp_out_bo;
     void (*free_private_data)(void **data);
     void *private_data;
 };
diff --git a/i965_drv_video/i965_post_processing.c b/i965_drv_video/i965_post_processing.c
new file mode 100644
index 0000000..633100c
--- /dev/null
+++ b/i965_drv_video/i965_post_processing.c
@@ -0,0 +1,2029 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Xiang Haihao <haihao.xiang@intel.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <va/va_backend.h>
+
+#include "intel_batchbuffer.h"
+#include "intel_driver.h"
+
+#include "i965_defines.h"
+#include "i965_post_processing.h"
+#include "i965_render.h"
+#include "i965_drv_video.h"
+
+struct pp_module
+{
+    /* kernel */
+    const char *name;       /* label; pp_modules_gen5 initialises it from string literals, hence const */
+    int interface;          /* PP_* module identifier */
+    unsigned int (*bin)[4]; /* kernel binary, stored as rows of four dwords */
+    int size;               /* sizeof() of the array that bin points to */
+    dri_bo *bo;             /* buffer object slot; NULL in the static table */
+
+    /* others: per-module hook that prepares the surface and the walker callbacks */
+    void (*initialize)(VADriverContextP ctx, VASurfaceID surface, int input,
+                       unsigned short srcw, unsigned short srch,
+                       unsigned short destw, unsigned short desth);
+};
+
+static uint32_t pp_null_gen5[][4] = {              /* kernel for "NULL module (for testing)"; each row is four dwords */
+#include "shaders/post_processing/null.g4b.gen5"
+};
+
+static uint32_t pp_nv12_load_save_gen5[][4] = {    /* kernel for "NV12 Load & Save module" */
+#include "shaders/post_processing/nv12_load_save_nv12.g4b.gen5"
+};
+
+static uint32_t pp_nv12_scaling_gen5[][4] = {      /* kernel for "NV12 Scaling module" */
+#include "shaders/post_processing/nv12_scaling_nv12.g4b.gen5"
+};
+
+static uint32_t pp_nv12_avs_gen5[][4] = {          /* kernel for "NV12 AVS module" */
+#include "shaders/post_processing/nv12_avs_nv12.g4b.gen5"
+};
+
+static uint32_t pp_nv12_dndi_gen5[][4] = {         /* kernel for "NV12 DNDI module" */
+#include "shaders/post_processing/nv12_dndi_nv12.g4b.gen5"
+};
+
+/*
+ * Prototypes for the per-module initialize hooks.  They are declared here
+ * because pp_modules_gen5 takes their addresses before the definitions
+ * appear; every hook has the pp_module.initialize signature.
+ */
+static void ironlake_pp_null_initialize(VADriverContextP ctx, VASurfaceID surface, int input,
+        unsigned short srcw, unsigned short srch, unsigned short destw, unsigned short desth);
+static void ironlake_pp_nv12_load_save_initialize(VADriverContextP ctx, VASurfaceID surface, int input,
+        unsigned short srcw, unsigned short srch, unsigned short destw, unsigned short desth);
+static void ironlake_pp_nv12_scaling_initialize(VADriverContextP ctx, VASurfaceID surface, int input,
+        unsigned short srcw, unsigned short srch, unsigned short destw, unsigned short desth);
+static void ironlake_pp_nv12_avs_initialize(VADriverContextP ctx, VASurfaceID surface, int input,
+        unsigned short srcw, unsigned short srch, unsigned short destw, unsigned short desth);
+static void ironlake_pp_nv12_dndi_initialize(VADriverContextP ctx, VASurfaceID surface, int input,
+        unsigned short srcw, unsigned short srch, unsigned short destw, unsigned short desth);
+
+static struct pp_module pp_modules_gen5[] = {   /* gen5 module table: one entry per PP_* module */
+    {
+        .name = "NULL module (for testing)",
+        .interface = PP_NULL,
+        .bin = pp_null_gen5,
+        .size = sizeof(pp_null_gen5),
+        .bo = NULL,
+        .initialize = ironlake_pp_null_initialize,
+    },
+
+    {
+        .name = "NV12 Load & Save module",
+        .interface = PP_NV12_LOAD_SAVE,
+        .bin = pp_nv12_load_save_gen5,
+        .size = sizeof(pp_nv12_load_save_gen5),
+        .bo = NULL,
+        .initialize = ironlake_pp_nv12_load_save_initialize,
+    },
+
+    {
+        .name = "NV12 Scaling module",
+        .interface = PP_NV12_SCALING,
+        .bin = pp_nv12_scaling_gen5,
+        .size = sizeof(pp_nv12_scaling_gen5),
+        .bo = NULL,
+        .initialize = ironlake_pp_nv12_scaling_initialize,
+    },
+
+    {
+        .name = "NV12 AVS module",
+        .interface = PP_NV12_AVS,
+        .bin = pp_nv12_avs_gen5,
+        .size = sizeof(pp_nv12_avs_gen5),
+        .bo = NULL,
+        .initialize = ironlake_pp_nv12_avs_initialize,
+    },
+
+    {
+        .name = "NV12 DNDI module",
+        .interface = PP_NV12_DNDI,
+        .bin = pp_nv12_dndi_gen5,
+        .size = sizeof(pp_nv12_dndi_gen5),
+        .bo = NULL,
+        .initialize = ironlake_pp_nv12_dndi_initialize,
+    },
+};
+/* Number of entries in the gen5 module table. */
+#define NUM_PP_MODULES ARRAY_ELEMS(pp_modules_gen5)
+/* Module table in use; starts out NULL and is indexed with pp_context->current_pp. */
+static struct pp_module *pp_modules = NULL;
+
+struct ironlake_pp_static_parameter
+{   /* Must stay 128 bytes (asserted in ironlake_pp_upload_constants, which copies it to the CURBE BO); kernels see it as grf 1-4 */
+    struct {
+        /* Procamp r1.0 */
+        float procamp_constant_c0;
+        
+        /* Load and Save r1.1 */
+        unsigned int source_packed_y_offset:8;
+        unsigned int source_packed_u_offset:8;
+        unsigned int source_packed_v_offset:8;
+        unsigned int pad0:8;
+
+        union {
+            /* Load and Save r1.2 */
+            struct {
+                unsigned int destination_packed_y_offset:8;
+                unsigned int destination_packed_u_offset:8;
+                unsigned int destination_packed_v_offset:8;
+                unsigned int pad0:8;
+            } load_and_save;
+
+            /* CSC r1.2 */
+            struct {
+                unsigned int destination_rgb_format:8;
+                unsigned int pad0:24;
+            } csc;
+        } r1_2;
+        
+        /* Procamp r1.3 */
+        float procamp_constant_c1;
+
+        /* Procamp r1.4 */
+        float procamp_constant_c2;
+
+        /* DI r1.5 */
+        unsigned int statistics_surface_picth:16;  /* Divided by 2; "picth" (sic) is the statistics surface pitch */
+        unsigned int pad1:16;
+
+        union {
+            /* DI r1.6 */
+            struct {
+                unsigned int pad0:24;
+                unsigned int top_field_first:8;
+            } di;
+
+            /* AVS/Scaling r1.6 */
+            float normalized_video_y_scaling_step;
+        } r1_6;
+
+        /* Procamp r1.7 */
+        float procamp_constant_c5;
+    } grf1;
+    
+    struct {
+        /* Procamp r2.0 */
+        float procamp_constant_c3;
+
+        /* MBZ r2.1 */
+        unsigned int pad0;
+
+        /* WG+CSC r2.2 */
+        float wg_csc_constant_c4;
+
+        /* WG+CSC r2.3 */
+        float wg_csc_constant_c8;
+
+        /* Procamp r2.4 */
+        float procamp_constant_c4;
+
+        /* MBZ r2.5 */
+        unsigned int pad1;
+
+        /* MBZ r2.6 */
+        unsigned int pad2;
+
+        /* WG+CSC r2.7 */
+        float wg_csc_constant_c9;
+    } grf2;
+
+    struct {
+        /* WG+CSC r3.0 */
+        float wg_csc_constant_c0;
+
+        /* Blending r3.1 */
+        float scaling_step_ratio;
+
+        /* Blending r3.2 */
+        float normalized_alpha_y_scaling;
+        
+        /* WG+CSC r3.3; NOTE(review): grf2 (r2.2) is also named wg_csc_constant_c4 and no c5 is declared -- confirm which is which */
+        float wg_csc_constant_c4;
+
+        /* WG+CSC r3.4 */
+        float wg_csc_constant_c1;
+
+        /* ALL r3.5 */
+        int horizontal_origin_offset:16;
+        int vertical_origin_offset:16;
+
+        /* Shared r3.6 */
+        union {
+            /* Color fill */
+            unsigned int color_pixel;
+
+            /* WG+CSC */
+            float wg_csc_constant_c2;
+        } r3_6;
+
+        /* WG+CSC r3.7 */
+        float wg_csc_constant_c3;
+    } grf3;
+
+    struct {
+        /* WG+CSC r4.0 */
+        float wg_csc_constant_c6;
+
+        /* ALL r4.1 MBZ ??? (original author was unsure this must be zero) */
+        unsigned int pad0;
+
+        /* Shared r4.2 */
+        union {
+            /* AVS */
+            struct {
+                unsigned int pad1:15;
+                unsigned int nlas:1;
+                unsigned int pad2:16;
+            } avs;
+
+            /* DI */
+            struct {
+                unsigned int motion_history_coefficient_m2:8;
+                unsigned int motion_history_coefficient_m1:8;
+                unsigned int pad0:16;
+            } di;
+        } r4_2;
+
+        /* WG+CSC r4.3 */
+        float wg_csc_constant_c7;
+
+        /* WG+CSC r4.4 */
+        float wg_csc_constant_c10;
+
+        /* AVS r4.5 */
+        float source_video_frame_normalized_horizontal_origin;
+
+        /* MBZ r4.6 */
+        unsigned int pad1;
+
+        /* WG+CSC r4.7 */
+        float wg_csc_constant_c11;
+    } grf4;
+};
+
+struct ironlake_pp_inline_parameter
+{   /* Must stay 64 bytes (asserted in ironlake_pp_object_walker); sent as MEDIA_OBJECT inline data, grf 5-6 */
+    struct {
+        /* ALL r5.0: origin of the destination block, set per step by the set_block_parameter callbacks */
+        int destination_block_horizontal_origin:16;
+        int destination_block_vertical_origin:16;
+
+        /* Shared r5.1 */
+        union {
+            /* AVS/Scaling */
+            float source_surface_block_normalized_horizontal_origin;
+
+            /* FMD */
+            struct {
+                unsigned int variance_surface_vertical_origin:16;
+                unsigned int pad0:16;
+            } fmd;
+        } r5_1; 
+
+        /* AVS/Scaling r5.2 */
+        float source_surface_block_normalized_vertical_origin;
+
+        /* Alpha r5.3 */
+        float alpha_surface_block_normalized_horizontal_origin;
+
+        /* Alpha r5.4 */
+        float alpha_surface_block_normalized_vertical_origin;
+
+        /* Alpha r5.5 (block_count_x is only 8 bits wide) */
+        unsigned int alpha_mask_x:16;
+        unsigned int alpha_mask_y:8;
+        unsigned int block_count_x:8;
+
+        /* r5.6 (number_blocks is only 8 bits wide) */
+        unsigned int block_horizontal_mask:16;
+        unsigned int block_vertical_mask:8;
+        unsigned int number_blocks:8;
+
+        /* AVS/Scaling r5.7 */
+        float normalized_video_x_scaling_step;
+    } grf5;
+
+    struct {
+        /* AVS r6.0 */
+        float video_step_delta;
+
+        /* r6.1-r6.7: padding that fills the rest of grf6 */
+        unsigned int padx[7];
+    } grf6;
+};
+
+static struct ironlake_pp_static_parameter ironlake_pp_static_parameter;   /* file-wide instance; copied to the CURBE BO by ironlake_pp_upload_constants() */
+static struct ironlake_pp_inline_parameter ironlake_pp_inline_parameter;   /* file-wide instance; emitted with every MEDIA_OBJECT by ironlake_pp_object_walker() */
+
+static void
+ironlake_pp_surface_state(struct i965_post_processing_context *pp_context)
+{
+    /* Empty: nothing is written here (ironlake_pp_nv12_load_save_initialize() builds its surface-state BOs itself). */
+}
+
+static void
+ironlake_pp_interface_descriptor_table(struct i965_post_processing_context *pp_context)
+{
+    const int module = pp_context->current_pp;
+    dri_bo *idrt_bo = pp_context->idrt.bo;
+    struct i965_interface_descriptor *idrt;
+
+    /* Write the single interface descriptor through a mapping of the IDRT BO. */
+    dri_bo_map(idrt_bo, 1);
+    assert(idrt_bo->virtual);
+    idrt = idrt_bo->virtual;
+    memset(idrt, 0, sizeof(*idrt));
+    idrt->desc0.grf_reg_blocks = 10;
+    idrt->desc0.kernel_start_pointer = pp_modules[module].bo->offset >> 6; /* reloc */
+    idrt->desc1.const_urb_entry_read_offset = 0;
+    idrt->desc1.const_urb_entry_read_len = 4; /* grf 1-4 */
+    idrt->desc2.sampler_state_pointer = pp_context->sampler_state_table.bo->offset >> 5;
+    idrt->desc2.sampler_count = 0;
+    idrt->desc3.binding_table_entry_count = 0;
+    idrt->desc3.binding_table_pointer = pp_context->binding_table.bo->offset >> 5; /* reloc */
+
+    /*
+     * One relocation per pointer.  The delta given to each is the value of
+     * the count field that lives in the same descN word as that pointer.
+     */
+    dri_bo_emit_reloc(idrt_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+                      idrt->desc0.grf_reg_blocks,
+                      offsetof(struct i965_interface_descriptor, desc0),
+                      pp_modules[module].bo);
+
+    dri_bo_emit_reloc(idrt_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+                      idrt->desc2.sampler_count << 2,
+                      offsetof(struct i965_interface_descriptor, desc2),
+                      pp_context->sampler_state_table.bo);
+
+    dri_bo_emit_reloc(idrt_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+                      idrt->desc3.binding_table_entry_count,
+                      offsetof(struct i965_interface_descriptor, desc3),
+                      pp_context->binding_table.bo);
+
+    dri_bo_unmap(idrt_bo);
+}
+
+static void
+ironlake_pp_binding_table(struct i965_post_processing_context *pp_context)
+{
+    dri_bo *table_bo = pp_context->binding_table.bo;
+    unsigned int *entries;
+    int slot;
+
+    dri_bo_map(table_bo, 1);
+    assert(table_bo->virtual);
+    entries = table_bo->virtual;
+    memset(entries, 0, table_bo->size);
+
+    /* One entry per surface slot; slots without a surface state stay zero. */
+    for (slot = 0; slot < MAX_PP_SURFACES; slot++) {
+        dri_bo *ss_bo = pp_context->surfaces[slot].ss_bo;
+
+        if (!ss_bo)
+            continue;
+
+        assert(pp_context->surfaces[slot].s_bo);
+        entries[slot] = ss_bo->offset;
+        dri_bo_emit_reloc(table_bo,
+                          I915_GEM_DOMAIN_INSTRUCTION, 0, 0,
+                          slot * sizeof(*entries), ss_bo);
+    }
+
+    dri_bo_unmap(table_bo);
+}
+
+static void
+ironlake_pp_vfe_state(struct i965_post_processing_context *pp_context)
+{
+    dri_bo *vfe_bo = pp_context->vfe_state.bo;
+    struct i965_vfe_state *vfe;
+
+    dri_bo_map(vfe_bo, 1);
+    assert(vfe_bo->virtual);
+    vfe = vfe_bo->virtual;
+    memset(vfe, 0, sizeof(*vfe));
+
+    /* Thread and URB entry limits all come from pp_context->urb. */
+    vfe->vfe1.num_urb_entries = pp_context->urb.num_vfe_entries;
+    vfe->vfe1.max_threads = pp_context->urb.num_vfe_entries - 1;
+    vfe->vfe1.urb_entry_alloc_size = pp_context->urb.size_vfe_entry - 1;
+    vfe->vfe1.vfe_mode = VFE_GENERIC_MODE;
+    vfe->vfe1.children_present = 0;
+
+    /* Base of the interface descriptor table; covered by the reloc below. */
+    vfe->vfe2.interface_descriptor_base = pp_context->idrt.bo->offset >> 4;
+    dri_bo_emit_reloc(vfe_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0,
+                      offsetof(struct i965_vfe_state, vfe2),
+                      pp_context->idrt.bo);
+    dri_bo_unmap(vfe_bo);
+}
+
+static void
+ironlake_pp_upload_constants(struct i965_post_processing_context *pp_context)
+{
+    dri_bo *curbe_bo = pp_context->curbe.bo;
+
+    /* Copy the 128-byte static parameter block into the CURBE BO. */
+    assert(sizeof(ironlake_pp_static_parameter) == 128);
+    dri_bo_map(curbe_bo, 1);
+    assert(curbe_bo->virtual);
+    memcpy(curbe_bo->virtual, &ironlake_pp_static_parameter, sizeof(ironlake_pp_static_parameter));
+    dri_bo_unmap(curbe_bo);
+}
+
+static void
+ironlake_pp_states_setup(VADriverContextP ctx)
+{
+    struct i965_driver_data *drv = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp = &drv->render_state.pp_context;
+    /* Populate each state buffer object in turn. */
+    ironlake_pp_surface_state(pp);
+    ironlake_pp_binding_table(pp);
+    ironlake_pp_interface_descriptor_table(pp);
+    ironlake_pp_vfe_state(pp);
+    ironlake_pp_upload_constants(pp);
+}
+
+static void
+ironlake_pp_pipeline_select(VADriverContextP ctx)
+{
+    BEGIN_BATCH(ctx, 1);    /* this command is a single dword */
+    OUT_BATCH(ctx, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);    /* select the media pipeline */
+    ADVANCE_BATCH(ctx);
+}
+
+static void
+ironlake_pp_urb_layout(VADriverContextP ctx, struct i965_post_processing_context *pp_context)
+{
+    /*
+     * The VFE fence is the CS start offset and the CS fence is the total
+     * URB size, both taken from pp_context->urb.
+     */
+    const unsigned int vfe_fence = pp_context->urb.cs_start;
+    const unsigned int cs_fence = pp_context->urb.size;
+
+    BEGIN_BATCH(ctx, 3);
+    OUT_BATCH(ctx, CMD_URB_FENCE | UF0_VFE_REALLOC | UF0_CS_REALLOC | 1);
+    OUT_BATCH(ctx, 0);
+    OUT_BATCH(ctx, (vfe_fence << UF2_VFE_FENCE_SHIFT) | (cs_fence << UF2_CS_FENCE_SHIFT));
+    ADVANCE_BATCH(ctx);
+}
+
+static void
+ironlake_pp_state_base_address(VADriverContextP ctx)
+{
+    int i;
+
+    BEGIN_BATCH(ctx, 8);
+    OUT_BATCH(ctx, CMD_STATE_BASE_ADDRESS | 6);
+    /* The remaining seven dwords are all 0 | BASE_ADDRESS_MODIFY. */
+    for (i = 0; i < 7; i++) {
+        OUT_BATCH(ctx, 0 | BASE_ADDRESS_MODIFY);
+    }
+
+    ADVANCE_BATCH(ctx);
+}
+
+static void
+ironlake_pp_state_pointers(VADriverContextP ctx, struct i965_post_processing_context *pp_context)
+{
+    BEGIN_BATCH(ctx, 3);
+    OUT_BATCH(ctx, CMD_MEDIA_STATE_POINTERS | 1);
+    OUT_BATCH(ctx, 0);      /* second dword is left zero */
+    OUT_RELOC(ctx, pp_context->vfe_state.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);    /* third dword: the VFE state BO filled by ironlake_pp_vfe_state() */
+    ADVANCE_BATCH(ctx);
+}
+
+static void
+ironlake_pp_cs_urb_layout(VADriverContextP ctx, struct i965_post_processing_context *pp_context)
+{
+    /* URB Entry Allocation Size (minus one) from bit 4 up, Number of URB Entries from bit 0. */
+    const unsigned int cs_urb = ((pp_context->urb.size_cs_entry - 1) << 4) | (pp_context->urb.num_cs_entries << 0);
+    BEGIN_BATCH(ctx, 2);
+    OUT_BATCH(ctx, CMD_CS_URB_STATE | 0);
+    OUT_BATCH(ctx, cs_urb);
+    ADVANCE_BATCH(ctx);
+}
+
+static void
+ironlake_pp_constant_buffer(VADriverContextP ctx, struct i965_post_processing_context *pp_context)
+{
+    BEGIN_BATCH(ctx, 2);
+    OUT_BATCH(ctx, CMD_CONSTANT_BUFFER | (1 << 8) | (2 - 2));
+    OUT_RELOC(ctx, pp_context->curbe.bo,    /* the BO filled by ironlake_pp_upload_constants() */
+              I915_GEM_DOMAIN_INSTRUCTION, 0,
+              pp_context->urb.size_cs_entry - 1);   /* reloc delta; NOTE(review): presumably the buffer-length field sharing this dword -- confirm */
+    ADVANCE_BATCH(ctx);    
+}
+
+static void
+ironlake_pp_object_walker(VADriverContextP ctx, struct i965_post_processing_context *pp_context)
+{
+    void *priv = &pp_context->private_context;
+    const int cols = pp_context->pp_x_steps(priv);
+    const int rows = pp_context->pp_y_steps(priv);
+    int col, row;
+
+    /* Row-major walk; one MEDIA_OBJECT per block the module accepts. */
+    for (row = 0; row < rows; row++) {
+        for (col = 0; col < cols; col++) {
+            /* A non-zero return means this block is skipped. */
+            if (pp_context->pp_set_block_parameter(priv, col, row))
+                continue;
+            BEGIN_BATCH(ctx, 20);
+            OUT_BATCH(ctx, CMD_MEDIA_OBJECT | 18);
+            OUT_BATCH(ctx, 0);
+            OUT_BATCH(ctx, 0); /* no indirect data */
+            OUT_BATCH(ctx, 0);
+            /* inline data grf 5-6 */
+            assert(sizeof(ironlake_pp_inline_parameter) == 64);
+            intel_batchbuffer_data(ctx, &ironlake_pp_inline_parameter, sizeof(ironlake_pp_inline_parameter));
+            ADVANCE_BATCH(ctx);
+        }
+    }
+}
+
+static void
+ironlake_pp_pipeline_setup(VADriverContextP ctx)
+{
+    struct i965_driver_data *drv = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp = &drv->render_state.pp_context;
+    /* Emit the whole media pipeline program inside one atomic batch section. */
+    intel_batchbuffer_start_atomic(ctx, 0x1000);
+    intel_batchbuffer_emit_mi_flush(ctx);
+    ironlake_pp_pipeline_select(ctx);
+    ironlake_pp_state_base_address(ctx);
+    ironlake_pp_state_pointers(ctx, pp);
+    ironlake_pp_urb_layout(ctx, pp);
+    ironlake_pp_cs_urb_layout(ctx, pp);
+    ironlake_pp_constant_buffer(ctx, pp);
+    ironlake_pp_object_walker(ctx, pp);
+    intel_batchbuffer_end_atomic(ctx);
+}
+
+static int
+pp_null_x_steps(void *private_context)
+{
+    return 1;   /* one step in x; private_context is unused */
+}
+
+static int
+pp_null_y_steps(void *private_context)
+{
+    return 1;   /* one step in y, so the walker visits a single (0, 0) block */
+}
+
+static int
+pp_null_set_block_parameter(void *private_context, int x, int y)
+{
+    return 0;   /* sets nothing; 0 tells ironlake_pp_object_walker() to emit the block */
+}
+
+static void
+ironlake_pp_null_initialize(VADriverContextP ctx, VASurfaceID surface, int input,
+                            unsigned short srcw, unsigned short srch,
+                            unsigned short destw, unsigned short desth)
+{   /* input, srcw, srch, destw and desth are not used by this module */
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp_context = &i965->render_state.pp_context;
+    struct object_surface *obj_surface;
+    /* Pass-through module: the output BO aliases the input BO.  Validate before
+     * touching refcounts, and reference the new BO before dropping the old one. */
+    obj_surface = SURFACE(surface);
+    assert(obj_surface && obj_surface->bo);
+    dri_bo_reference(obj_surface->bo);
+    dri_bo_unreference(obj_surface->pp_out_bo);
+    obj_surface->pp_out_bo = obj_surface->bo;
+    obj_surface->pp_out_width = obj_surface->width;
+    obj_surface->pp_out_height = obj_surface->height;
+    obj_surface->orig_pp_out_width = obj_surface->orig_width;
+    obj_surface->orig_pp_out_height = obj_surface->orig_height;
+
+    /* private function & data: a 1x1 walk whose single block is always emitted */
+    pp_context->pp_x_steps = pp_null_x_steps;
+    pp_context->pp_y_steps = pp_null_y_steps;
+    pp_context->pp_set_block_parameter = pp_null_set_block_parameter;
+}
+
+static int
+pp_load_save_x_steps(void *private_context)
+{
+    return 1;   /* one step in x; the initialize hook sets block_count_x to w / 16 ("1 x N") */
+}
+
+static int
+pp_load_save_y_steps(void *private_context)
+{
+    /* One step per 8 units of destination height (integer division). */
+    const struct pp_load_save_context *load_save = private_context;
+    return load_save->dest_h / 8;
+}
+
+static int
+pp_load_save_set_block_parameter(void *private_context, int x, int y)
+{
+    /* The block origin moves 16 per x step and 8 per y step; both masks are all ones. */
+    ironlake_pp_inline_parameter.grf5.destination_block_horizontal_origin = x * 16;
+    ironlake_pp_inline_parameter.grf5.destination_block_vertical_origin = y * 8;
+    ironlake_pp_inline_parameter.grf5.block_horizontal_mask = 0xffff;
+    ironlake_pp_inline_parameter.grf5.block_vertical_mask = 0xff;
+    return 0; /* never asks the walker to skip a block */
+}
+
+static void
+ironlake_pp_nv12_load_save_initialize(VADriverContextP ctx, VASurfaceID surface, int input,
+                                      unsigned short srcw, unsigned short srch,
+                                      unsigned short destw, unsigned short desth)
+{   /* input, srcw, srch, destw and desth are not used by this module */
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp_context = &i965->render_state.pp_context;
+    struct pp_load_save_context *pp_load_save_context = (struct pp_load_save_context *)&pp_context->private_context;
+    struct object_surface *obj_surface;
+    struct i965_surface_state *ss;
+    dri_bo *bo;
+    int index, w, h;
+    int orig_w, orig_h;
+
+    /* Look up the surface and cache its dimensions (no NULL check on the lookup). */
+    obj_surface = SURFACE(surface);
+    orig_w = obj_surface->orig_width;
+    orig_h = obj_surface->orig_height;
+    w = obj_surface->width;
+    h = obj_surface->height;
+    /* Drop any previous output BO and allocate a new one sized by SIZE_YUV420(w, h). */
+    dri_bo_unreference(obj_surface->pp_out_bo);
+    obj_surface->pp_out_bo = dri_bo_alloc(i965->intel.bufmgr,
+                                          "intermediate surface",
+                                          SIZE_YUV420(w, h),
+                                          4096);
+    assert(obj_surface->pp_out_bo);
+    obj_surface->pp_out_width = obj_surface->width;
+    obj_surface->pp_out_height = obj_surface->height;
+    obj_surface->orig_pp_out_width = obj_surface->orig_width;
+    obj_surface->orig_pp_out_height = obj_surface->orig_height;
+
+    /* source Y plane: surface index 1, at offset 0 of the input BO */
+    index = 1;
+    pp_context->surfaces[index].s_bo = obj_surface->bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset;
+    ss->ss2.width = orig_w / 4 - 1;
+    ss->ss2.height = orig_h - 1;
+    ss->ss3.pitch = w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      0,
+                      0,
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* source UV plane: surface index 2, at offset w * h of the input BO, half the Y height */
+    index = 2;
+    pp_context->surfaces[index].s_bo = obj_surface->bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h;
+    ss->ss2.width = orig_w / 4 - 1;
+    ss->ss2.height = orig_h / 2 - 1;
+    ss->ss3.pitch = w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      0,
+                      w * h,
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* destination Y plane: surface index 7, at offset 0 of pp_out_bo; both reloc domain arguments are RENDER */
+    index = 7;
+    pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset;
+    ss->ss2.width = orig_w / 4 - 1;
+    ss->ss2.height = orig_h - 1;
+    ss->ss3.pitch = w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      I915_GEM_DOMAIN_RENDER,
+                      0,
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* destination UV plane: surface index 8, at offset w * h of pp_out_bo; both reloc domain arguments are RENDER */
+    index = 8;
+    pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h;
+    ss->ss2.width = orig_w / 4 - 1;
+    ss->ss2.height = orig_h / 2 - 1;
+    ss->ss3.pitch = w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      I915_GEM_DOMAIN_RENDER,
+                      w * h,
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* walker callbacks and the private context they read (dest_h / 8 steps in y) */
+    pp_context->pp_x_steps = pp_load_save_x_steps;
+    pp_context->pp_y_steps = pp_load_save_y_steps;
+    pp_context->pp_set_block_parameter = pp_load_save_set_block_parameter;
+    pp_load_save_context->dest_h = h;
+    pp_load_save_context->dest_w = w;
+    /* Both counts are 8-bit fields in the inline parameter; they are constant for every step. */
+    ironlake_pp_inline_parameter.grf5.block_count_x = w / 16;   /* 1 x N */
+    ironlake_pp_inline_parameter.grf5.number_blocks = w / 16;
+}
+
+static int
+pp_scaling_x_steps(void *private_context) /* x-step callback installed as pp_context->pp_x_steps by ironlake_pp_nv12_scaling_initialize; argument unused */
+{
+    return 1; /* always one horizontal step */
+}
+
+static int
+pp_scaling_y_steps(void *private_context) /* y-step callback for scaling: returns dest_h / 8 (integer division) */
+{
+    struct pp_scaling_context *pp_scaling_context = private_context; /* argument is treated as a struct pp_scaling_context */
+
+    return pp_scaling_context->dest_h / 8; /* dest_h is assigned pp_out_h in ironlake_pp_nv12_scaling_initialize */
+}
+
+static int
+pp_scaling_set_block_parameter(void *private_context, int x, int y) /* writes the per-block inline parameters for step (x, y); private_context is unused; always returns 0 */
+{
+    float src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; /* x step read back from the global inline parameters */
+    float src_y_steping = ironlake_pp_static_parameter.grf1.r1_6.normalized_video_y_scaling_step; /* y step read back from the global static parameters */
+
+    ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin = src_x_steping * x * 16; /* source x origin: 16 x-steps per block index */
+    ironlake_pp_inline_parameter.grf5.source_surface_block_normalized_vertical_origin = src_y_steping * y * 8; /* source y origin: 8 y-steps per block index */
+    ironlake_pp_inline_parameter.grf5.destination_block_horizontal_origin = x * 16; /* destination x origin advances by 16 per x */
+    ironlake_pp_inline_parameter.grf5.destination_block_vertical_origin = y * 8; /* destination y origin advances by 8 per y */
+    
+    return 0;
+}
+
+static void
+ironlake_pp_nv12_scaling_initialize(VADriverContextP ctx, VASurfaceID surface, int input, /* builds surface states 1, 2, 7, 8, sampler entries 1, 2 and the step callbacks for scaling `surface` */
+                                    unsigned short srcw, unsigned short srch, /* input, srcw and srch are not read in this function */
+                                    unsigned short destw, unsigned short desth) /* requested output size; recorded as orig_pp_out_width/height */
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp_context = &i965->render_state.pp_context;
+    struct pp_scaling_context *pp_scaling_context = (struct pp_scaling_context *)&pp_context->private_context; /* private_context storage viewed as scaling data */
+    struct object_surface *obj_surface;
+    struct i965_sampler_state *sampler_state;
+    struct i965_surface_state *ss;
+    dri_bo *bo;
+    int index;
+    int w, h;
+    int orig_w, orig_h;
+    int pp_out_w, pp_out_h;
+    int orig_pp_out_w, orig_pp_out_h;
+
+    /* surface: look up the object and read its orig_* and width/height fields */
+    obj_surface = SURFACE(surface);
+    orig_w = obj_surface->orig_width;
+    orig_h = obj_surface->orig_height;
+    w = obj_surface->width;
+    h = obj_surface->height;
+
+    orig_pp_out_w = destw;
+    orig_pp_out_h = desth;
+    pp_out_w = ALIGN(orig_pp_out_w, 16); /* presumably rounds up to a multiple of 16; ALIGN is defined elsewhere - confirm */
+    pp_out_h = ALIGN(orig_pp_out_h, 16);
+    dri_bo_unreference(obj_surface->pp_out_bo); /* release this surface's reference to any previous intermediate buffer */
+    obj_surface->pp_out_bo = dri_bo_alloc(i965->intel.bufmgr,
+                                          "intermediate surface",
+                                          SIZE_YUV420(pp_out_w, pp_out_h), /* size macro defined elsewhere */
+                                          4096);
+    assert(obj_surface->pp_out_bo);
+    obj_surface->orig_pp_out_width = orig_pp_out_w;
+    obj_surface->orig_pp_out_height = orig_pp_out_h;
+    obj_surface->pp_out_width = pp_out_w;
+    obj_surface->pp_out_height = pp_out_h;
+
+    /* source Y surface index 1: R8 view of obj_surface->bo starting at offset 0 */
+    index = 1;
+    pp_context->surfaces[index].s_bo = obj_surface->bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo); /* pp_context takes its own reference on the source bo */
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset;
+    ss->ss2.width = orig_w - 1; /* width/height/pitch fields are stored minus one */
+    ss->ss2.height = orig_h - 1;
+    ss->ss3.pitch = w - 1; /* pitch comes from w (obj_surface->width), not orig_w */
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      0, /* no write domain for the source plane */
+                      0, /* delta 0: matches base_addr above */
+                      offsetof(struct i965_surface_state, ss1), /* relocation patches the ss1 base address */
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* source UV surface index 2: R8G8 view of obj_surface->bo starting at byte offset w * h */
+    index = 2;
+    pp_context->surfaces[index].s_bo = obj_surface->bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h;
+    ss->ss2.width = orig_w / 2 - 1; /* half of the Y dimensions in both directions */
+    ss->ss2.height = orig_h / 2 - 1;
+    ss->ss3.pitch = w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      0,
+                      w * h, /* delta must match the w * h added to base_addr above */
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* destination Y surface index 7: R8 view of the new pp_out_bo starting at offset 0 */
+    index = 7;
+    pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset;
+    ss->ss2.width = pp_out_w / 4 - 1; /* NOTE(review): destination width is programmed as a quarter of pp_out_w; reason not visible here - confirm against the kernel's write format */
+    ss->ss2.height = pp_out_h - 1;
+    ss->ss3.pitch = pp_out_w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      I915_GEM_DOMAIN_RENDER, /* destination is also given a write domain */
+                      0,
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* destination UV surface index 8: R8G8 view of pp_out_bo starting at pp_out_w * pp_out_h */
+    index = 8;
+    pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + pp_out_w * pp_out_h;
+    ss->ss2.width = pp_out_w / 4 - 1;
+    ss->ss2.height = pp_out_h / 2 - 1;
+    ss->ss3.pitch = pp_out_w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      I915_GEM_DOMAIN_RENDER,
+                      pp_out_w * pp_out_h, /* delta must match the offset added to base_addr above */
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* sampler state: entries 1 and 2 of the shared sampler table get linear filtering with clamped coordinates */
+    dri_bo_map(pp_context->sampler_state_table.bo, True);
+    assert(pp_context->sampler_state_table.bo->virtual);
+    sampler_state = pp_context->sampler_state_table.bo->virtual; /* entries are not cleared here; only the fields below are written */
+
+    /* SIMD16 Y index 1 */
+    sampler_state[1].ss0.min_filter = I965_MAPFILTER_LINEAR;
+    sampler_state[1].ss0.mag_filter = I965_MAPFILTER_LINEAR;
+    sampler_state[1].ss1.r_wrap_mode = I965_TEXCOORDMODE_CLAMP;
+    sampler_state[1].ss1.s_wrap_mode = I965_TEXCOORDMODE_CLAMP;
+    sampler_state[1].ss1.t_wrap_mode = I965_TEXCOORDMODE_CLAMP;
+
+    /* SIMD16 UV index 2 */
+    sampler_state[2].ss0.min_filter = I965_MAPFILTER_LINEAR;
+    sampler_state[2].ss0.mag_filter = I965_MAPFILTER_LINEAR;
+    sampler_state[2].ss1.r_wrap_mode = I965_TEXCOORDMODE_CLAMP;
+    sampler_state[2].ss1.s_wrap_mode = I965_TEXCOORDMODE_CLAMP;
+    sampler_state[2].ss1.t_wrap_mode = I965_TEXCOORDMODE_CLAMP;
+
+    dri_bo_unmap(pp_context->sampler_state_table.bo);
+
+    /* private function & data: install the scaling step callbacks and record the output size */
+    pp_context->pp_x_steps = pp_scaling_x_steps;
+    pp_context->pp_y_steps = pp_scaling_y_steps;
+    pp_context->pp_set_block_parameter = pp_scaling_set_block_parameter;
+
+    pp_scaling_context->dest_w = pp_out_w;
+    pp_scaling_context->dest_h = pp_out_h;
+
+    ironlake_pp_static_parameter.grf1.r1_6.normalized_video_y_scaling_step = (float) 1.0 / pp_out_h; /* reciprocal of the aligned output height */
+    ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = (float) 1.0 / pp_out_w; /* reciprocal of the aligned output width */
+    ironlake_pp_inline_parameter.grf5.block_count_x = pp_out_w / 16;   /* 1 x N */
+    ironlake_pp_inline_parameter.grf5.number_blocks = pp_out_w / 16; /* same value as block_count_x */
+    ironlake_pp_inline_parameter.grf5.block_vertical_mask = 0xff;
+    ironlake_pp_inline_parameter.grf5.block_horizontal_mask = 0xffff;
+}
+
+static int
+pp_avs_x_steps(void *private_context) /* x-step callback for the AVS path: returns dest_w / 16 (integer division) */
+{
+    struct pp_avs_context *pp_avs_context = private_context; /* argument is treated as a struct pp_avs_context */
+
+    return pp_avs_context->dest_w / 16; /* dest_w is assigned pp_out_w in ironlake_pp_nv12_avs_initialize */
+}
+
+static int
+pp_avs_y_steps(void *private_context) /* y-step callback for the AVS path; argument unused */
+{
+    return 1; /* always one vertical step */
+}
+
+static int
+pp_avs_set_block_parameter(void *private_context, int x, int y) /* writes per-block inline parameters for the AVS path; x != 0 branches accumulate onto values left in the global by earlier calls; always returns 0 */
+{
+    struct pp_avs_context *pp_avs_context = private_context;
+    float src_x_steping, src_y_steping, video_step_delta;
+    int tmp_w = ALIGN(pp_avs_context->dest_h * pp_avs_context->src_w / pp_avs_context->src_h, 16); /* dest_h * src_w / src_h in integer arithmetic, then ALIGN(.., 16); NOTE(review): no guard against src_h == 0 */
+
+    if (tmp_w >= pp_avs_context->dest_w) { /* aspect-scaled width is at least dest_w: constant step 1 / tmp_w with zero delta */
+        ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = 1.0 / tmp_w;
+        ironlake_pp_inline_parameter.grf6.video_step_delta = 0;
+        
+        if (x == 0) {
+            ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin = (float)(tmp_w - pp_avs_context->dest_w) / tmp_w / 2; /* start at half of the excess width, normalized by tmp_w */
+        } else {
+            src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step;
+            video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta;
+            ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 + /* advance the origin by one block: 16 steps ... */
+                16 * 15 * video_step_delta / 2; /* ... plus 16 * 15 / 2 deltas (zero in this branch) */
+        }
+    } else {
+        int n0, n1, n2, nls_left, nls_right;
+        int factor_a = 5, factor_b = 4;
+        float f;
+
+        n0 = (pp_avs_context->dest_w - tmp_w) / (16 * 2); /* half of the extra width, counted in 16-wide blocks */
+        n1 = (pp_avs_context->dest_w - tmp_w) / 16 - n0; /* remaining extra blocks */
+        n2 = tmp_w / (16 * factor_a); /* tmp_w / 80 blocks */
+        nls_left = n0 + n2; /* block count of the left region handled below */
+        nls_right = n1 + n2; /* block count of the right region handled below */
+        f = (float) n2 * 16 / tmp_w; /* fraction of tmp_w covered by n2 blocks */
+        
+        if (n0 < 5) { /* n0 below 5: constant step 1 / dest_w with zero delta */
+            ironlake_pp_inline_parameter.grf6.video_step_delta = 0.0;
+
+            if (x == 0) {
+                ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = 1.0 / pp_avs_context->dest_w; /* step is only written at x == 0 and reused for later x */
+                ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin = 0.0;
+            } else {
+                src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step;
+                video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta;
+                ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 +
+                    16 * 15 * video_step_delta / 2;
+            }
+        } else {
+            if (x < nls_left) { /* left region: step starts at a and is raised by 16 * b per block */
+                /* f = a * nls_left * 16 + b * nls_left * 16 * (nls_left * 16 - 1) / 2 */
+                float a = f / (nls_left * 16 * factor_b);
+                float b = (f - nls_left * 16 * a) * 2 / (nls_left * 16 * (nls_left * 16 - 1)); /* b solved from the equation in the comment above */
+                
+                ironlake_pp_inline_parameter.grf6.video_step_delta = b;
+
+                if (x == 0) {
+                    ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin = 0.0;
+                    ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = a;
+                } else {
+                    src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; /* step of the previous block, still in the global */
+                    video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta; /* equals b, written just above */
+                    ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 +
+                        16 * 15 * video_step_delta / 2;
+                    ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step += 16 * b;
+                }
+            } else if (x < (pp_avs_context->dest_w / 16 - nls_right)) {
+                /* scale the center linearly */
+                src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; /* origin is advanced with the previous block's step and delta ... */
+                video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta;
+                ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 +
+                    16 * 15 * video_step_delta / 2;
+                ironlake_pp_inline_parameter.grf6.video_step_delta = 0.0; /* ... before delta and step are reset for this block */
+                ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = 1.0 / tmp_w;
+            } else { /* right region: delta is -b, so the step shrinks from block to block */
+                float a = f / (nls_right * 16 * factor_b);
+                float b = (f - nls_right * 16 * a) * 2 / (nls_right * 16 * (nls_right * 16 - 1));
+
+                src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; /* previous block's step and delta are read before being overwritten */
+                video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta;
+                ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 +
+                    16 * 15 * video_step_delta / 2;
+                ironlake_pp_inline_parameter.grf6.video_step_delta = -b;
+
+                if (x == (pp_avs_context->dest_w / 16 - nls_right)) /* first block of the right region */
+                    ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = a + (nls_right * 16  - 1) * b;
+                else
+                    ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step -= b * 16;
+            }
+        }
+    }
+
+    src_y_steping = ironlake_pp_static_parameter.grf1.r1_6.normalized_video_y_scaling_step; /* y step read back from the global static parameters */
+    ironlake_pp_inline_parameter.grf5.source_surface_block_normalized_vertical_origin = src_y_steping * y * 8;
+    ironlake_pp_inline_parameter.grf5.destination_block_horizontal_origin = x * 16;
+    ironlake_pp_inline_parameter.grf5.destination_block_vertical_origin = y * 8;
+
+    return 0;
+}
+
+static void
+ironlake_pp_nv12_avs_initialize(VADriverContextP ctx, VASurfaceID surface, int input, /* builds surface states 1, 2, 7, 8, the sample_8x8 sampler entries and the AVS step callbacks; input == 1 selects pp_out_bo as the source */
+                                unsigned short srcw, unsigned short srch, /* srcw and srch are not read in this function */
+                                unsigned short destw, unsigned short desth) /* requested output size; recorded as orig_pp_out_width/height */
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp_context = &i965->render_state.pp_context;
+    struct pp_avs_context *pp_avs_context = (struct pp_avs_context *)&pp_context->private_context; /* private_context storage viewed as AVS data */
+    struct object_surface *obj_surface;
+    struct i965_surface_state *ss;
+    struct i965_sampler_8x8 *sampler_8x8;
+    struct i965_sampler_8x8_state *sampler_8x8_state;
+    struct i965_surface_state2 *ss_8x8;
+    dri_bo *bo;
+    int index;
+    int w, h;
+    int orig_w, orig_h;
+    int pp_out_w, pp_out_h;
+    int orig_pp_out_w, orig_pp_out_h;
+
+    /* surface: choose source dimensions from either the previous output or the surface itself */
+    obj_surface = SURFACE(surface);
+    
+    if (input == 1) { /* source is the existing intermediate buffer, which must already be present */
+        assert(obj_surface->pp_out_bo);
+        orig_w = obj_surface->orig_pp_out_width;
+        orig_h = obj_surface->orig_pp_out_height;
+        w = obj_surface->pp_out_width;
+        h = obj_surface->pp_out_height;
+    } else {
+        orig_w = obj_surface->orig_width;
+        orig_h = obj_surface->orig_height;
+        w = obj_surface->width;
+        h = obj_surface->height;
+    } 
+    /* source Y surface index 1: i965_surface_state2 (sample_8x8) view starting at offset 0 */
+    index = 1;
+    pp_context->surfaces[index].s_bo = (input == 1 ? obj_surface->pp_out_bo : obj_surface->bo);
+    dri_bo_reference(pp_context->surfaces[index].s_bo); /* pp_context takes its own reference on the source bo */
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "Y surface state for sample_8x8", 
+                      sizeof(struct i965_surface_state2), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss_8x8 = bo->virtual;
+    memset(ss_8x8, 0, sizeof(*ss_8x8));
+    ss_8x8->ss0.surface_base_address = pp_context->surfaces[index].s_bo->offset;
+    ss_8x8->ss1.cbcr_pixel_offset_v_direction = 0;
+    ss_8x8->ss1.width = orig_w - 1; /* width/height/pitch fields are stored minus one */
+    ss_8x8->ss1.height = orig_h - 1;
+    ss_8x8->ss2.half_pitch_for_chroma = 0;
+    ss_8x8->ss2.pitch = w - 1;
+    ss_8x8->ss2.interleave_chroma = 0;
+    ss_8x8->ss2.surface_format = SURFACE_FORMAT_Y8_UNORM;
+    ss_8x8->ss3.x_offset_for_cb = 0;
+    ss_8x8->ss3.y_offset_for_cb = 0;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      0, /* no write domain for the source plane */
+                      0,
+                      offsetof(struct i965_surface_state2, ss0), /* relocation patches the ss0 base address */
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* source UV surface index 2: same bo as index 1, starting at byte offset w * h */
+    index = 2;
+    pp_context->surfaces[index].s_bo = (input == 1 ? obj_surface->pp_out_bo : obj_surface->bo);
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "UV surface state for sample_8x8", 
+                      sizeof(struct i965_surface_state2), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss_8x8 = bo->virtual;
+    memset(ss_8x8, 0, sizeof(*ss_8x8));
+    ss_8x8->ss0.surface_base_address = pp_context->surfaces[index].s_bo->offset + w * h;
+    ss_8x8->ss1.cbcr_pixel_offset_v_direction = 0;
+    ss_8x8->ss1.width = orig_w / 2 - 1; /* half of the Y dimensions in both directions */
+    ss_8x8->ss1.height = orig_h / 2 - 1;
+    ss_8x8->ss2.half_pitch_for_chroma = 0;
+    ss_8x8->ss2.pitch = w - 1;
+    ss_8x8->ss2.interleave_chroma = 0;
+    ss_8x8->ss2.surface_format = SURFACE_FORMAT_R8B8_UNORM;
+    ss_8x8->ss3.x_offset_for_cb = 0;
+    ss_8x8->ss3.y_offset_for_cb = 0;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      0,
+                      w * h, /* delta must match the w * h added to the base address above */
+                      offsetof(struct i965_surface_state2, ss0),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    orig_pp_out_w = destw;
+    orig_pp_out_h = desth;
+    pp_out_w = ALIGN(orig_pp_out_w, 16); /* presumably rounds up to a multiple of 16; ALIGN is defined elsewhere - confirm */
+    pp_out_h = ALIGN(orig_pp_out_h, 16);
+    dri_bo_unreference(obj_surface->pp_out_bo); /* with input == 1 the old buffer is still referenced by surfaces 1 and 2 above */
+    obj_surface->pp_out_bo = dri_bo_alloc(i965->intel.bufmgr,
+                                          "intermediate surface",
+                                          SIZE_YUV420(pp_out_w, pp_out_h), /* size macro defined elsewhere */
+                                          4096);
+    assert(obj_surface->pp_out_bo);
+    obj_surface->orig_pp_out_width = orig_pp_out_w;
+    obj_surface->orig_pp_out_height = orig_pp_out_h;
+    obj_surface->pp_out_width = pp_out_w;
+    obj_surface->pp_out_height = pp_out_h;
+
+    /* destination Y surface index 7: R8 view of the new pp_out_bo starting at offset 0 */
+    index = 7;
+    pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset;
+    ss->ss2.width = pp_out_w / 4 - 1; /* NOTE(review): destination width is programmed as a quarter of pp_out_w; reason not visible here - confirm against the kernel's write format */
+    ss->ss2.height = pp_out_h - 1;
+    ss->ss3.pitch = pp_out_w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      I915_GEM_DOMAIN_RENDER, /* destination is also given a write domain */
+                      0,
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* destination UV surface index 8: R8G8 view of pp_out_bo starting at pp_out_w * pp_out_h */
+    index = 8;
+    pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr, 
+                      "surface state", 
+                      sizeof(struct i965_surface_state), 
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + pp_out_w * pp_out_h;
+    ss->ss2.width = pp_out_w / 4 - 1;
+    ss->ss2.height = pp_out_h / 2 - 1;
+    ss->ss3.pitch = pp_out_w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      I915_GEM_DOMAIN_RENDER,
+                      pp_out_w * pp_out_h, /* delta must match the offset added to base_addr above */
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+    
+    /* sampler 8x8 state for Y: cleared, then all-channel adaptive filter flag set with x/y adaptive filtering bypassed */
+    dri_bo_map(pp_context->sampler_state_table.bo_8x8, True);
+    assert(pp_context->sampler_state_table.bo_8x8->virtual);
+    assert(sizeof(*sampler_8x8_state) == sizeof(int) * 138); /* layout check: the state struct must be 138 ints in size */
+    sampler_8x8_state = pp_context->sampler_state_table.bo_8x8->virtual;
+    memset(sampler_8x8_state, 0, sizeof(*sampler_8x8_state));
+    sampler_8x8_state->dw136.default_sharpness_level = 0;
+    sampler_8x8_state->dw137.adaptive_filter_for_all_channel = 1;
+    sampler_8x8_state->dw137.bypass_y_adaptive_filtering = 1;
+    sampler_8x8_state->dw137.bypass_x_adaptive_filtering = 1;
+    dri_bo_unmap(pp_context->sampler_state_table.bo_8x8);
+
+    /* sampler 8x8: the shared sampler table stays mapped until both entries below are written */
+    dri_bo_map(pp_context->sampler_state_table.bo, True);
+    assert(pp_context->sampler_state_table.bo->virtual);
+    assert(sizeof(*sampler_8x8) == sizeof(int) * 16); /* layout check: each table entry must be 16 ints in size */
+    sampler_8x8 = pp_context->sampler_state_table.bo->virtual;
+
+    /* sample_8x8 Y index 1: adaptive 8-tap AVS filter; the IEF/PWL values below are hard-coded constants */
+    index = 1;
+    memset(&sampler_8x8[index], 0, sizeof(*sampler_8x8));
+    sampler_8x8[index].dw0.avs_filter_type = AVS_FILTER_ADAPTIVE_8_TAP;
+    sampler_8x8[index].dw0.ief_bypass = 0;
+    sampler_8x8[index].dw0.ief_filter_type = IEF_FILTER_DETAIL;
+    sampler_8x8[index].dw0.ief_filter_size = IEF_FILTER_SIZE_5X5;
+    sampler_8x8[index].dw1.sampler_8x8_state_pointer = pp_context->sampler_state_table.bo_8x8->offset >> 5; /* bo_8x8 offset shifted right by 5; a relocation for this dword is emitted below */
+    sampler_8x8[index].dw2.global_noise_estimation = 22;
+    sampler_8x8[index].dw2.strong_edge_threshold = 8;
+    sampler_8x8[index].dw2.weak_edge_threshold = 1;
+    sampler_8x8[index].dw3.strong_edge_weight = 7;
+    sampler_8x8[index].dw3.regular_weight = 2;
+    sampler_8x8[index].dw3.non_edge_weight = 0;
+    sampler_8x8[index].dw3.gain_factor = 40;
+    sampler_8x8[index].dw4.steepness_boost = 0;
+    sampler_8x8[index].dw4.steepness_threshold = 0;
+    sampler_8x8[index].dw4.mr_boost = 0;
+    sampler_8x8[index].dw4.mr_threshold = 5;
+    sampler_8x8[index].dw5.pwl1_point_1 = 4;
+    sampler_8x8[index].dw5.pwl1_point_2 = 12;
+    sampler_8x8[index].dw5.pwl1_point_3 = 16;
+    sampler_8x8[index].dw5.pwl1_point_4 = 26;
+    sampler_8x8[index].dw6.pwl1_point_5 = 40;
+    sampler_8x8[index].dw6.pwl1_point_6 = 160;
+    sampler_8x8[index].dw6.pwl1_r3_bias_0 = 127;
+    sampler_8x8[index].dw6.pwl1_r3_bias_1 = 98;
+    sampler_8x8[index].dw7.pwl1_r3_bias_2 = 88;
+    sampler_8x8[index].dw7.pwl1_r3_bias_3 = 64;
+    sampler_8x8[index].dw7.pwl1_r3_bias_4 = 44;
+    sampler_8x8[index].dw7.pwl1_r3_bias_5 = 0;
+    sampler_8x8[index].dw8.pwl1_r3_bias_6 = 0;
+    sampler_8x8[index].dw8.pwl1_r5_bias_0 = 3;
+    sampler_8x8[index].dw8.pwl1_r5_bias_1 = 32;
+    sampler_8x8[index].dw8.pwl1_r5_bias_2 = 32;
+    sampler_8x8[index].dw9.pwl1_r5_bias_3 = 58;
+    sampler_8x8[index].dw9.pwl1_r5_bias_4 = 100;
+    sampler_8x8[index].dw9.pwl1_r5_bias_5 = 108;
+    sampler_8x8[index].dw9.pwl1_r5_bias_6 = 88;
+    sampler_8x8[index].dw10.pwl1_r3_slope_0 = -116;
+    sampler_8x8[index].dw10.pwl1_r3_slope_1 = -20;
+    sampler_8x8[index].dw10.pwl1_r3_slope_2 = -96;
+    sampler_8x8[index].dw10.pwl1_r3_slope_3 = -32;
+    sampler_8x8[index].dw11.pwl1_r3_slope_4 = -50;
+    sampler_8x8[index].dw11.pwl1_r3_slope_5 = 0;
+    sampler_8x8[index].dw11.pwl1_r3_slope_6 = 0;
+    sampler_8x8[index].dw11.pwl1_r5_slope_0 = 116;
+    sampler_8x8[index].dw12.pwl1_r5_slope_1 = 0;
+    sampler_8x8[index].dw12.pwl1_r5_slope_2 = 114;
+    sampler_8x8[index].dw12.pwl1_r5_slope_3 = 67;
+    sampler_8x8[index].dw12.pwl1_r5_slope_4 = 9;
+    sampler_8x8[index].dw13.pwl1_r5_slope_5 = -3;
+    sampler_8x8[index].dw13.pwl1_r5_slope_6 = -15;
+    sampler_8x8[index].dw13.limiter_boost = 0;
+    sampler_8x8[index].dw13.minimum_limiter = 10;
+    sampler_8x8[index].dw13.maximum_limiter = 11;
+    sampler_8x8[index].dw14.clip_limiter = 130;
+    dri_bo_emit_reloc(pp_context->sampler_state_table.bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      0,
+                      0,
+                      sizeof(*sampler_8x8) * index + offsetof(struct i965_sampler_8x8, dw1), /* byte position of entry 1's dw1 inside the sampler table */
+                      pp_context->sampler_state_table.bo_8x8);
+
+    dri_bo_map(pp_context->sampler_state_table.bo_8x8_uv, True); /* separate 8x8 state buffer for the UV sampler */
+    assert(pp_context->sampler_state_table.bo_8x8_uv->virtual);
+    assert(sizeof(*sampler_8x8_state) == sizeof(int) * 138);
+    sampler_8x8_state = pp_context->sampler_state_table.bo_8x8_uv->virtual;
+    memset(sampler_8x8_state, 0, sizeof(*sampler_8x8_state));
+    sampler_8x8_state->dw136.default_sharpness_level = 0;
+    sampler_8x8_state->dw137.adaptive_filter_for_all_channel = 0; /* differs from the Y state, which sets this to 1 */
+    sampler_8x8_state->dw137.bypass_y_adaptive_filtering = 1;
+    sampler_8x8_state->dw137.bypass_x_adaptive_filtering = 1;
+    dri_bo_unmap(pp_context->sampler_state_table.bo_8x8_uv);
+
+    /* sample_8x8 UV index 2: nearest AVS filter; remaining values repeat the Y entry */
+    index = 2;
+    memset(&sampler_8x8[index], 0, sizeof(*sampler_8x8));
+    sampler_8x8[index].dw0.avs_filter_type = AVS_FILTER_NEAREST;
+    sampler_8x8[index].dw0.ief_bypass = 0;
+    sampler_8x8[index].dw0.ief_filter_type = IEF_FILTER_DETAIL;
+    sampler_8x8[index].dw0.ief_filter_size = IEF_FILTER_SIZE_5X5;
+    sampler_8x8[index].dw1.sampler_8x8_state_pointer = pp_context->sampler_state_table.bo_8x8_uv->offset >> 5; /* points at the UV 8x8 state buffer */
+    sampler_8x8[index].dw2.global_noise_estimation = 22;
+    sampler_8x8[index].dw2.strong_edge_threshold = 8;
+    sampler_8x8[index].dw2.weak_edge_threshold = 1;
+    sampler_8x8[index].dw3.strong_edge_weight = 7;
+    sampler_8x8[index].dw3.regular_weight = 2;
+    sampler_8x8[index].dw3.non_edge_weight = 0;
+    sampler_8x8[index].dw3.gain_factor = 40;
+    sampler_8x8[index].dw4.steepness_boost = 0;
+    sampler_8x8[index].dw4.steepness_threshold = 0;
+    sampler_8x8[index].dw4.mr_boost = 0;
+    sampler_8x8[index].dw4.mr_threshold = 5;
+    sampler_8x8[index].dw5.pwl1_point_1 = 4;
+    sampler_8x8[index].dw5.pwl1_point_2 = 12;
+    sampler_8x8[index].dw5.pwl1_point_3 = 16;
+    sampler_8x8[index].dw5.pwl1_point_4 = 26;
+    sampler_8x8[index].dw6.pwl1_point_5 = 40;
+    sampler_8x8[index].dw6.pwl1_point_6 = 160;
+    sampler_8x8[index].dw6.pwl1_r3_bias_0 = 127;
+    sampler_8x8[index].dw6.pwl1_r3_bias_1 = 98;
+    sampler_8x8[index].dw7.pwl1_r3_bias_2 = 88;
+    sampler_8x8[index].dw7.pwl1_r3_bias_3 = 64;
+    sampler_8x8[index].dw7.pwl1_r3_bias_4 = 44;
+    sampler_8x8[index].dw7.pwl1_r3_bias_5 = 0;
+    sampler_8x8[index].dw8.pwl1_r3_bias_6 = 0;
+    sampler_8x8[index].dw8.pwl1_r5_bias_0 = 3;
+    sampler_8x8[index].dw8.pwl1_r5_bias_1 = 32;
+    sampler_8x8[index].dw8.pwl1_r5_bias_2 = 32;
+    sampler_8x8[index].dw9.pwl1_r5_bias_3 = 58;
+    sampler_8x8[index].dw9.pwl1_r5_bias_4 = 100;
+    sampler_8x8[index].dw9.pwl1_r5_bias_5 = 108;
+    sampler_8x8[index].dw9.pwl1_r5_bias_6 = 88;
+    sampler_8x8[index].dw10.pwl1_r3_slope_0 = -116;
+    sampler_8x8[index].dw10.pwl1_r3_slope_1 = -20;
+    sampler_8x8[index].dw10.pwl1_r3_slope_2 = -96;
+    sampler_8x8[index].dw10.pwl1_r3_slope_3 = -32;
+    sampler_8x8[index].dw11.pwl1_r3_slope_4 = -50;
+    sampler_8x8[index].dw11.pwl1_r3_slope_5 = 0;
+    sampler_8x8[index].dw11.pwl1_r3_slope_6 = 0;
+    sampler_8x8[index].dw11.pwl1_r5_slope_0 = 116;
+    sampler_8x8[index].dw12.pwl1_r5_slope_1 = 0;
+    sampler_8x8[index].dw12.pwl1_r5_slope_2 = 114;
+    sampler_8x8[index].dw12.pwl1_r5_slope_3 = 67;
+    sampler_8x8[index].dw12.pwl1_r5_slope_4 = 9;
+    sampler_8x8[index].dw13.pwl1_r5_slope_5 = -3;
+    sampler_8x8[index].dw13.pwl1_r5_slope_6 = -15;
+    sampler_8x8[index].dw13.limiter_boost = 0;
+    sampler_8x8[index].dw13.minimum_limiter = 10;
+    sampler_8x8[index].dw13.maximum_limiter = 11;
+    sampler_8x8[index].dw14.clip_limiter = 130;
+    dri_bo_emit_reloc(pp_context->sampler_state_table.bo,
+                      I915_GEM_DOMAIN_RENDER, 
+                      0,
+                      0,
+                      sizeof(*sampler_8x8) * index + offsetof(struct i965_sampler_8x8, dw1), /* byte position of entry 2's dw1 inside the sampler table */
+                      pp_context->sampler_state_table.bo_8x8_uv);
+
+    dri_bo_unmap(pp_context->sampler_state_table.bo);
+
+    /* private function & data: install the AVS step callbacks and record source/destination sizes */
+    pp_context->pp_x_steps = pp_avs_x_steps;
+    pp_context->pp_y_steps = pp_avs_y_steps;
+    pp_context->pp_set_block_parameter = pp_avs_set_block_parameter;
+
+    pp_avs_context->dest_w = pp_out_w;
+    pp_avs_context->dest_h = pp_out_h;
+    pp_avs_context->src_w = w; /* src_w/src_h take w/h selected at the top, not orig_w/orig_h */
+    pp_avs_context->src_h = h;
+
+    ironlake_pp_static_parameter.grf4.r4_2.avs.nlas = 1;
+    ironlake_pp_static_parameter.grf1.r1_6.normalized_video_y_scaling_step = (float) 1.0 / pp_out_h; /* reciprocal of the aligned output height */
+    ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = (float) 1.0 / pp_out_w; /* initial value; pp_avs_set_block_parameter rewrites it per block */
+    ironlake_pp_inline_parameter.grf5.block_count_x = 1;        /* M x 1 */
+    ironlake_pp_inline_parameter.grf5.number_blocks = pp_out_h / 8;
+    ironlake_pp_inline_parameter.grf5.block_vertical_mask = 0xff;
+    ironlake_pp_inline_parameter.grf5.block_horizontal_mask = 0xffff;
+    ironlake_pp_inline_parameter.grf6.video_step_delta = 0.0;
+}
+
+static int
+pp_dndi_x_steps(void *private_context)
+{
+    return 1;   /* DNDI always reports one step in x; private_context is not consulted */
+}
+
+static int
+pp_dndi_y_steps(void *private_context)
+{
+    /* One step per 4 destination rows (integer division of dest_h). */
+    const struct pp_dndi_context *dndi = private_context;
+    return dndi->dest_h / 4;
+}
+
+static int
+pp_dndi_set_block_parameter(void *private_context, int x, int y)
+{
+    /* Step (x, y) selects the destination block whose origin is (16 * x, 4 * y). */
+    ironlake_pp_inline_parameter.grf5.destination_block_vertical_origin = y * 4;
+    ironlake_pp_inline_parameter.grf5.destination_block_horizontal_origin = x * 16;
+    return 0;   /* always reports success; private_context is not used */
+}
+
+/* Set up the surface states, DNDI sampler and walker parameters for the NV12 denoise/deinterlace kernel. */
+static void ironlake_pp_nv12_dndi_initialize(VADriverContextP ctx, VASurfaceID surface, int input,
+                                             unsigned short srcw, unsigned short srch,
+                                             unsigned short destw, unsigned short desth)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp_context = &i965->render_state.pp_context;
+    struct pp_dndi_context *pp_dndi_context = (struct pp_dndi_context *)&pp_context->private_context;
+    struct object_surface *obj_surface;
+    struct i965_surface_state *ss;
+    struct i965_surface_state2 *ss_dndi;
+    struct i965_sampler_dndi *sampler_dndi;
+    dri_bo *bo;
+    int index, w, h, orig_w, orig_h;
+
+    /* input, srcw, srch, destw and desth are unused: all sizes come from the surface itself */
+    obj_surface = SURFACE(surface);
+    assert(obj_surface);
+    orig_w = obj_surface->orig_width;
+    orig_h = obj_surface->orig_height;
+    w = obj_surface->width;
+    h = obj_surface->height;
+
+    if (pp_context->stmm.bo && pp_context->stmm.bo->size < (unsigned long)(w * h)) {
+        dri_bo_unreference(pp_context->stmm.bo);        /* too small for this surface: reallocate */
+        pp_context->stmm.bo = NULL;
+    }
+    if (pp_context->stmm.bo == NULL) {
+        pp_context->stmm.bo = dri_bo_alloc(i965->intel.bufmgr, "STMM surface", w * h, 4096);
+        assert(pp_context->stmm.bo);
+    }
+
+    dri_bo_unreference(obj_surface->pp_out_bo);         /* the result buffer is recreated on every call */
+    obj_surface->pp_out_bo = dri_bo_alloc(i965->intel.bufmgr,
+                                          "intermediate surface",
+                                          SIZE_YUV420(w, h),
+                                          4096);
+    assert(obj_surface->pp_out_bo);
+    obj_surface->orig_pp_out_width = orig_w;
+    obj_surface->orig_pp_out_height = orig_h;
+    obj_surface->pp_out_width = w;
+    obj_surface->pp_out_height = h;
+
+    /* source UV surface index 2 */
+    index = 2;
+    pp_context->surfaces[index].s_bo = obj_surface->bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "surface state",
+                      sizeof(struct i965_surface_state),
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h;      /* UV data follows the w * h luma bytes */
+    ss->ss2.width = orig_w / 4 - 1;
+    ss->ss2.height = orig_h / 2 - 1;
+    ss->ss3.pitch = w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      0,
+                      w * h,                            /* same delta as added to base_addr above */
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* source YUV surface index 4 */
+    index = 4;
+    pp_context->surfaces[index].s_bo = obj_surface->bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "YUV surface state for deinterlace ",
+                      sizeof(struct i965_surface_state2),
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss_dndi = bo->virtual;
+    memset(ss_dndi, 0, sizeof(*ss_dndi));
+    ss_dndi->ss0.surface_base_address = pp_context->surfaces[index].s_bo->offset;
+    /* each field is written once; the memset above already cleared everything else */
+    ss_dndi->ss1.width = w - 1;
+    ss_dndi->ss1.height = h - 1;
+    ss_dndi->ss1.cbcr_pixel_offset_v_direction = 1;
+    ss_dndi->ss2.half_pitch_for_chroma = 0;
+    ss_dndi->ss2.pitch = w - 1;
+    ss_dndi->ss2.interleave_chroma = 1;
+    ss_dndi->ss2.surface_format = SURFACE_FORMAT_PLANAR_420_8;
+    ss_dndi->ss2.tiled_surface = 0;
+    /* the chroma plane starts h rows below the luma plane, at column 0 */
+    ss_dndi->ss3.x_offset_for_cb = 0;
+    ss_dndi->ss3.y_offset_for_cb = h;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      0,
+                      0,
+                      offsetof(struct i965_surface_state2, ss0),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* source STMM surface index 20 */
+    index = 20;
+    pp_context->surfaces[index].s_bo = pp_context->stmm.bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "STMM surface state for deinterlace ",
+                      sizeof(struct i965_surface_state),       /* filled in through ss, not ss_dndi */
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset;
+    ss->ss2.width = w - 1;
+    ss->ss2.height = h - 1;
+    ss->ss3.pitch = w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,           /* write domain set: this surface is also written */
+                      0,
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* destination Y surface index 7 */
+    index = 7;
+    pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "surface state",
+                      sizeof(struct i965_surface_state),
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset;
+    ss->ss2.width = w / 4 - 1;
+    ss->ss2.height = h - 1;
+    ss->ss3.pitch = w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      0,
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* destination UV surface index 8 */
+    index = 8;
+    pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo;
+    dri_bo_reference(pp_context->surfaces[index].s_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "surface state",
+                      sizeof(struct i965_surface_state),
+                      4096);
+    assert(bo);
+    pp_context->surfaces[index].ss_bo = bo;
+    dri_bo_map(bo, True);
+    assert(bo->virtual);
+    ss = bo->virtual;
+    memset(ss, 0, sizeof(*ss));
+    ss->ss0.surface_type = I965_SURFACE_2D;
+    ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM;
+    ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h;      /* UV data follows the w * h luma bytes */
+    ss->ss2.width = w / 4 - 1;
+    ss->ss2.height = h / 2 - 1;
+    ss->ss3.pitch = w - 1;
+    dri_bo_emit_reloc(bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      w * h,
+                      offsetof(struct i965_surface_state, ss1),
+                      pp_context->surfaces[index].s_bo);
+    dri_bo_unmap(bo);
+
+    /* sampler dndi */
+    dri_bo_map(pp_context->sampler_state_table.bo, True);
+    assert(pp_context->sampler_state_table.bo->virtual);
+    assert(sizeof(*sampler_dndi) == sizeof(int) * 8);
+    sampler_dndi = pp_context->sampler_state_table.bo->virtual;
+
+    /* sampler dndi entry 0 (the only entry programmed here) */
+    index = 0;
+    sampler_dndi[index].dw0.denoise_asd_threshold = 0;
+    sampler_dndi[index].dw0.denoise_history_delta = 8;          // 0-15, default is 8
+    sampler_dndi[index].dw0.denoise_maximum_history = 128;      // 128-240
+    sampler_dndi[index].dw0.denoise_stad_threshold = 0;
+
+    sampler_dndi[index].dw1.denoise_threshold_for_sum_of_complexity_measure = 64;
+    sampler_dndi[index].dw1.denoise_moving_pixel_threshold = 0;
+    sampler_dndi[index].dw1.stmm_c2 = 0;
+    sampler_dndi[index].dw1.low_temporal_difference_threshold = 8;
+    sampler_dndi[index].dw1.temporal_difference_threshold = 16;
+
+    sampler_dndi[index].dw2.block_noise_estimate_noise_threshold = 15;   // 0-31
+    sampler_dndi[index].dw2.block_noise_estimate_edge_threshold = 7;    // 0-15
+    sampler_dndi[index].dw2.denoise_edge_threshold = 7;                 // 0-15
+    sampler_dndi[index].dw2.good_neighbor_threshold = 7;                // 0-63
+
+    sampler_dndi[index].dw3.maximum_stmm = 128;
+    sampler_dndi[index].dw3.multipler_for_vecm = 2;
+    sampler_dndi[index].dw3.blending_constant_across_time_for_small_values_of_stmm = 0;
+    sampler_dndi[index].dw3.blending_constant_across_time_for_large_values_of_stmm = 64;
+    sampler_dndi[index].dw3.stmm_blending_constant_select = 0;
+
+    sampler_dndi[index].dw4.sdi_delta = 8;
+    sampler_dndi[index].dw4.sdi_threshold = 128;
+    sampler_dndi[index].dw4.stmm_output_shift = 7;                      // stmm_max - stmm_min = 2 ^ stmm_output_shift
+    sampler_dndi[index].dw4.stmm_shift_up = 0;
+    sampler_dndi[index].dw4.stmm_shift_down = 0;
+    sampler_dndi[index].dw4.minimum_stmm = 0;
+
+    sampler_dndi[index].dw5.fmd_temporal_difference_threshold = 0;
+    sampler_dndi[index].dw5.sdi_fallback_mode_2_constant = 0;
+    sampler_dndi[index].dw5.sdi_fallback_mode_1_t2_constant = 0;
+    sampler_dndi[index].dw5.sdi_fallback_mode_1_t1_constant = 0;
+
+    sampler_dndi[index].dw6.dn_enable = 1;
+    sampler_dndi[index].dw6.di_enable = 1;
+    sampler_dndi[index].dw6.di_partial = 0;
+    sampler_dndi[index].dw6.dndi_top_first = 1;
+    sampler_dndi[index].dw6.dndi_stream_id = 1;
+    sampler_dndi[index].dw6.dndi_first_frame = 1;
+    sampler_dndi[index].dw6.progressive_dn = 0;
+    sampler_dndi[index].dw6.fmd_tear_threshold = 32;
+    sampler_dndi[index].dw6.fmd2_vertical_difference_threshold = 32;
+    sampler_dndi[index].dw6.fmd1_vertical_difference_threshold = 32;
+
+    sampler_dndi[index].dw7.fmd_for_1st_field_of_current_frame = 2;
+    sampler_dndi[index].dw7.fmd_for_2nd_field_of_previous_frame = 1;
+    sampler_dndi[index].dw7.vdi_walker_enable = 0;
+    sampler_dndi[index].dw7.column_width_minus1 = w / 16;
+
+    dri_bo_unmap(pp_context->sampler_state_table.bo);
+
+    /* private function & data */
+    pp_context->pp_x_steps = pp_dndi_x_steps;
+    pp_context->pp_y_steps = pp_dndi_y_steps;
+    pp_context->pp_set_block_parameter = pp_dndi_set_block_parameter;
+
+    ironlake_pp_static_parameter.grf1.statistics_surface_picth = w / 2;
+    ironlake_pp_static_parameter.grf1.r1_6.di.top_field_first = 0;      /* NOTE(review): sampler dw6.dndi_top_first is 1 above - confirm */
+    ironlake_pp_static_parameter.grf4.r4_2.di.motion_history_coefficient_m2 = 64;
+    ironlake_pp_static_parameter.grf4.r4_2.di.motion_history_coefficient_m1 = 192;
+
+    ironlake_pp_inline_parameter.grf5.block_count_x = w / 16;   /* 1 x N */
+    ironlake_pp_inline_parameter.grf5.number_blocks = w / 16;
+    ironlake_pp_inline_parameter.grf5.block_vertical_mask = 0xff;
+    ironlake_pp_inline_parameter.grf5.block_horizontal_mask = 0xffff;
+
+    pp_dndi_context->dest_w = w;        /* pp_dndi_y_steps() derives the row count from dest_h */
+    pp_dndi_context->dest_h = h;
+}
+
+static void
+ironlake_pp_initialize(VADriverContextP ctx,
+                       VASurfaceID surface,
+                       int input,
+                       short srcx,
+                       short srcy,
+                       unsigned short srcw,
+                       unsigned short srch,
+                       short destx,
+                       short desty,
+                       unsigned short destw,
+                       unsigned short desth,
+                       int pp_index)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp_context = &i965->render_state.pp_context;
+    struct pp_module *pp_module;
+    dri_bo *bo;
+    int i;
+
+    /* Replace the previous pass's state buffers with fresh ones, then run module pp_index's initialize hook. */
+    dri_bo_unreference(pp_context->curbe.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "constant buffer",
+                      4096,
+                      4096);
+    assert(bo);
+    pp_context->curbe.bo = bo;
+
+    dri_bo_unreference(pp_context->binding_table.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "binding table",
+                      sizeof(unsigned int) * MAX_PP_SURFACES,   /* presumably one entry per surface slot - confirm with the table writer */
+                      4096);
+    assert(bo);
+    pp_context->binding_table.bo = bo;
+
+    dri_bo_unreference(pp_context->idrt.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "interface discriptor",
+                      sizeof(struct i965_interface_descriptor),
+                      4096);
+    assert(bo);
+    pp_context->idrt.bo = bo;
+
+    dri_bo_unreference(pp_context->sampler_state_table.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "sampler state table",
+                      4096,
+                      4096);
+    assert(bo);
+    dri_bo_map(bo, True);
+    assert(bo->virtual);                        /* same check as every other map in this file */
+    memset(bo->virtual, 0, bo->size);           /* modules only fill in the sampler entries they use */
+    dri_bo_unmap(bo);
+    pp_context->sampler_state_table.bo = bo;
+
+    dri_bo_unreference(pp_context->sampler_state_table.bo_8x8);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "sampler 8x8 state ",
+                      4096,
+                      4096);
+    assert(bo);
+    pp_context->sampler_state_table.bo_8x8 = bo;
+
+    dri_bo_unreference(pp_context->sampler_state_table.bo_8x8_uv);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "sampler 8x8 state ",
+                      4096,
+                      4096);
+    assert(bo);
+    pp_context->sampler_state_table.bo_8x8_uv = bo;
+
+    dri_bo_unreference(pp_context->vfe_state.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "vfe state",
+                      sizeof(struct i965_vfe_state),
+                      4096);
+    assert(bo);
+    pp_context->vfe_state.bo = bo;
+
+    for (i = 0; i < MAX_PP_SURFACES; i++) {     /* forget every surface bound by the previous pass */
+        dri_bo_unreference(pp_context->surfaces[i].ss_bo);
+        pp_context->surfaces[i].ss_bo = NULL;
+
+        dri_bo_unreference(pp_context->surfaces[i].s_bo);
+        pp_context->surfaces[i].s_bo = NULL;
+    }
+    memset(&ironlake_pp_static_parameter, 0, sizeof(ironlake_pp_static_parameter));
+    memset(&ironlake_pp_inline_parameter, 0, sizeof(ironlake_pp_inline_parameter));
+    assert(pp_index >= PP_NULL && pp_index < NUM_PP_MODULES);
+    assert(pp_modules);
+    pp_context->current_pp = pp_index;
+    pp_module = &pp_modules[pp_index];
+    if (pp_module->initialize)                  /* srcx, srcy, destx and desty are not forwarded */
+        pp_module->initialize(ctx, surface, input, srcw, srch, destw, desth);
+}
+
+static void
+i965_post_processing_internal(VADriverContextP ctx,
+                              VASurfaceID surface,
+                              int input,
+                              short srcx,
+                              short srcy,
+                              unsigned short srcw,
+                              unsigned short srch,
+                              short destx,
+                              short desty,
+                              unsigned short destw,
+                              unsigned short desth,
+                              int pp_index)     /* module to run, e.g. PP_NV12_DNDI or PP_NV12_AVS */
+{
+    ironlake_pp_initialize(ctx, surface, input, /* step 1: fresh state buffers + module initialize hook */
+                           srcx, srcy, srcw, srch,
+                           destx, desty, destw, desth,
+                           pp_index);
+    ironlake_pp_states_setup(ctx);              /* step 2: defined outside this chunk; presumably fills the state buffers - confirm */
+    ironlake_pp_pipeline_setup(ctx);            /* step 3: defined outside this chunk; presumably emits the commands - confirm */
+}
+
+void
+i965_post_processing(VADriverContextP ctx,
+                     VASurfaceID surface,
+                     short srcx,
+                     short srcy,
+                     unsigned short srcw,
+                     unsigned short srch,
+                     short destx,
+                     short desty,
+                     unsigned short destw,
+                     unsigned short desth,
+                     unsigned int flag)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    int input = 0;
+
+    /* Ironlake only, and currently only support post processing for NV12 surface */
+    if (!IS_IRONLAKE(i965->intel.device_id) || !i965->render_state.interleaved_uv)
+        return;
+
+    /* Optional first pass: denoise/deinterlace. */
+    if (flag & I965_PP_FLAG_DEINTERLACING) {
+        i965_post_processing_internal(ctx, surface, input,
+                                      srcx, srcy, srcw, srch,
+                                      destx, desty, destw, desth,
+                                      PP_NV12_DNDI);
+        input = 1;
+    }
+
+    /* Optional second pass; input is 1 when the DNDI pass ran before it. */
+    if (flag & I965_PP_FLAG_AVS) {
+        i965_post_processing_internal(ctx, surface, input,
+                                      srcx, srcy, srcw, srch,
+                                      destx, desty, destw, desth,
+                                      PP_NV12_AVS);
+    }
+}
+
+void
+i965_post_processing_once_init(VADriverContextP ctx)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp_context = &i965->render_state.pp_context;
+    struct pp_module *module;
+    int n;
+
+    /* URB layout: the VFE entries start at 0, the CS entries follow directly behind them. */
+    pp_context->urb.size = URB_SIZE((&i965->intel));
+    pp_context->urb.vfe_start = 0;
+    pp_context->urb.num_vfe_entries = 32;
+    pp_context->urb.size_vfe_entry = 1;
+    pp_context->urb.num_cs_entries = 1;
+    pp_context->urb.size_cs_entry = 2;
+    pp_context->urb.cs_start = pp_context->urb.vfe_start +
+        pp_context->urb.num_vfe_entries * pp_context->urb.size_vfe_entry;
+    assert(pp_context->urb.cs_start +
+           pp_context->urb.num_cs_entries * pp_context->urb.size_cs_entry <= URB_SIZE((&i965->intel)));
+
+    /* Only Ironlake selects a kernel table; on other devices pp_modules is left as it was. */
+    if (IS_IRONLAKE(i965->intel.device_id))
+        pp_modules = pp_modules_gen5;
+
+    /* Copy every kernel binary of the selected table into a buffer object of its own. */
+    for (n = 0; pp_modules && n < NUM_PP_MODULES; n++) {
+        module = &pp_modules[n];
+        module->bo = dri_bo_alloc(i965->intel.bufmgr, module->name, module->size, 4096);
+        assert(module->bo);
+        dri_bo_subdata(module->bo, 0, module->size, module->bin);
+    }
+}
+
+/* Drop the reference held through *bo_ptr and clear the pointer. */
+static void
+pp_release_bo(dri_bo **bo_ptr)
+{
+    dri_bo_unreference(*bo_ptr);
+    *bo_ptr = NULL;
+}
+
+/*
+ * Release every buffer object held by the post processing context and by
+ * the kernel modules, leaving all the pointers NULL.  Always returns True.
+ */
+Bool
+i965_post_processing_terminate(VADriverContextP ctx)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct i965_post_processing_context *pp_context = &i965->render_state.pp_context;
+    int n;
+
+    /* constant buffer */
+    pp_release_bo(&pp_context->curbe.bo);
+
+    /* surface states and the surfaces they describe */
+    for (n = 0; n < MAX_PP_SURFACES; n++) {
+        pp_release_bo(&pp_context->surfaces[n].ss_bo);
+        pp_release_bo(&pp_context->surfaces[n].s_bo);
+    }
+
+    /* sampler state table and the two 8x8 sampler states */
+    pp_release_bo(&pp_context->sampler_state_table.bo);
+    pp_release_bo(&pp_context->sampler_state_table.bo_8x8);
+    pp_release_bo(&pp_context->sampler_state_table.bo_8x8_uv);
+
+    /* binding table, interface descriptor, VFE state */
+    pp_release_bo(&pp_context->binding_table.bo);
+    pp_release_bo(&pp_context->idrt.bo);
+    pp_release_bo(&pp_context->vfe_state.bo);
+
+    /* STMM surface kept by the DNDI pass */
+    pp_release_bo(&pp_context->stmm.bo);
+
+    /* kernel binaries uploaded by i965_post_processing_once_init() */
+    for (n = 0; pp_modules && n < NUM_PP_MODULES; n++) {
+        pp_release_bo(&pp_modules[n].bo);
+    }
+
+    return True;
+}
diff --git a/i965_drv_video/i965_post_processing.h b/i965_drv_video/i965_post_processing.h
new file mode 100644
index 0000000..360ded4
--- /dev/null
+++ b/i965_drv_video/i965_post_processing.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Xiang Haihao <haihao.xiang@intel.com>
+ *
+ */
+
+#ifndef __I965_POST_PROCESSING_H__
+#define __I965_POST_PROCESSING_H__
+
+#define MAX_PP_SURFACES 32              /* length of i965_post_processing_context.surfaces[] */
+
+#define I965_PP_FLAG_DEINTERLACING      1       /* i965_post_processing() flag bit: run the PP_NV12_DNDI pass */
+#define I965_PP_FLAG_AVS                2       /* i965_post_processing() flag bit: run the PP_NV12_AVS pass */
+
+enum    /* post processing module indices, used to index the pp_modules table */
+{
+    PP_NULL = 0,        /* lowest index accepted by ironlake_pp_initialize() */
+    PP_NV12_LOAD_SAVE,
+    PP_NV12_SCALING,
+    PP_NV12_AVS,        /* run when I965_PP_FLAG_AVS is set */
+    PP_NV12_DNDI,       /* run when I965_PP_FLAG_DEINTERLACING is set */
+};
+
+struct pp_load_save_context     /* member of the i965_post_processing_context.private_context union */
+{
+    int dest_w;                 /* NOTE(review): presumably the output width - confirm in the module's initialize function */
+    int dest_h;                 /* NOTE(review): presumably the output height - confirm */
+};
+
+struct pp_scaling_context       /* member of the i965_post_processing_context.private_context union */
+{
+    int dest_w;                 /* NOTE(review): presumably the output width - confirm in the module's initialize function */
+    int dest_h;                 /* NOTE(review): presumably the output height - confirm */
+};
+
+struct pp_avs_context           /* member of the i965_post_processing_context.private_context union */
+{
+    int dest_w;                 /* output width, set by the AVS initialize function */
+    int dest_h;                 /* output height, set by the AVS initialize function */
+    int src_w;                  /* source width, set by the AVS initialize function */
+    int src_h;                  /* source height, set by the AVS initialize function */
+};
+
+struct pp_dndi_context          /* member of the i965_post_processing_context.private_context union */
+{
+    int dest_w;                 /* set to the surface width by the DNDI initialize function */
+    int dest_h;                 /* set to the surface height; pp_dndi_y_steps() returns dest_h / 4 */
+
+};
+
+struct i965_post_processing_context     /* embedded in struct i965_render_state as pp_context */
+{
+    int current_pp;                     /* module index chosen by the last ironlake_pp_initialize() */
+
+    struct {
+        dri_bo *bo;
+    } curbe;                            /* "constant buffer", reallocated for every pass */
+
+    struct {
+        dri_bo *ss_bo;                  /* surface state describing s_bo */
+        dri_bo *s_bo;                   /* the surface itself; an extra reference is taken when it is bound */
+    } surfaces[MAX_PP_SURFACES];        /* all slots are released at the start of every pass */
+
+    struct {
+        dri_bo *bo;
+    } binding_table;                    /* reallocated for every pass */
+
+    struct {
+        dri_bo *bo;
+    } idrt;                             /* interface descriptor, reallocated for every pass */
+
+    struct {
+        dri_bo *bo;
+    } vfe_state;                        /* reallocated for every pass */
+
+    struct {
+        dri_bo *bo;                     /* sampler state table, zero-filled when allocated */
+        dri_bo *bo_8x8;                 /* "sampler 8x8 state " */
+        dri_bo *bo_8x8_uv;              /* second "sampler 8x8 state " bo, relocation target of a sampler_8x8 dw1 */
+    } sampler_state_table;
+
+    struct {
+        unsigned int size;              /* URB_SIZE() of the device */
+
+        unsigned int vfe_start;         /* 0 */
+        unsigned int cs_start;          /* vfe_start + num_vfe_entries * size_vfe_entry */
+
+        unsigned int num_vfe_entries;
+        unsigned int num_cs_entries;
+
+        unsigned int size_vfe_entry;
+        unsigned int size_cs_entry;
+    } urb;                              /* filled once by i965_post_processing_once_init() */
+
+    struct {
+        dri_bo *bo;
+    } stmm;                             /* "STMM surface" for the DNDI pass; kept across passes until terminate */
+
+    union {
+        struct pp_load_save_context pp_load_save_context;
+        struct pp_scaling_context pp_scaling_context;
+        struct pp_avs_context pp_avs_context;
+        struct pp_dndi_context pp_dndi_context;
+    } private_context;                  /* state of the current module; passed to the three callbacks below as void * */
+
+    int (*pp_x_steps)(void *private_context);   /* installed by the module's initialize function */
+    int (*pp_y_steps)(void *private_context);   /* installed by the module's initialize function */
+    int (*pp_set_block_parameter)(void *private_context, int x, int y); /* called with a step position (x, y) */
+};
+
+void
+i965_post_processing(VADriverContextP ctx,     /* runs the passes selected by flag on surface */
+                     VASurfaceID surface,
+                     short srcx,
+                     short srcy,
+                     unsigned short srcw,
+                     unsigned short srch,
+                     short destx,
+                     short desty,
+                     unsigned short destw,
+                     unsigned short desth,
+                     unsigned int flag);        /* mask of I965_PP_FLAG_* bits, not a module index */
+void                                   /* sets up the URB layout and uploads the kernel binaries */
+i965_post_processing_once_init(VADriverContextP ctx);
+Bool                                   /* releases every pp buffer object; always returns True */
+i965_post_processing_terminate(VADriverContextP ctx);
+
+#endif /* __I965_POST_PROCESSING_H__ */
diff --git a/i965_drv_video/i965_render.c b/i965_drv_video/i965_render.c
index c4e8ed8..ceef319 100644
--- a/i965_drv_video/i965_render.c
+++ b/i965_drv_video/i965_render.c
@@ -655,12 +655,20 @@ i965_render_src_surfaces_state(VADriverContextP ctx,
 
     obj_surface = SURFACE(surface);
     assert(obj_surface);
-    assert(obj_surface->bo);
-    w = obj_surface->width;
-    h = obj_surface->height;
-    rw = obj_surface->orig_width;
-    rh = obj_surface->orig_height;
-    region = obj_surface->bo;
+
+    if (obj_surface->pp_out_bo) {
+        w = obj_surface->pp_out_width;
+        h = obj_surface->pp_out_height;
+        rw = obj_surface->orig_pp_out_width;
+        rh = obj_surface->orig_pp_out_height;
+        region = obj_surface->pp_out_bo;
+    } else {
+        w = obj_surface->width;
+        h = obj_surface->height;
+        rw = obj_surface->orig_width;
+        rh = obj_surface->orig_height;
+        region = obj_surface->bo;
+    }
 
     i965_render_src_surface_state(ctx, 1, region, 0, rw, rh, w, I965_SURFACEFORMAT_R8_UNORM);     /* Y */
     i965_render_src_surface_state(ctx, 2, region, 0, rw, rh, w, I965_SURFACEFORMAT_R8_UNORM);
@@ -1454,8 +1462,14 @@ i965_render_put_surface(VADriverContextP ctx,
                         short destx,
                         short desty,
                         unsigned short destw,
-                        unsigned short desth)
+                        unsigned short desth,
+                        unsigned int flag)
 {
+    i965_post_processing(ctx, surface,
+                         srcx, srcy, srcw, srch,
+                         destx, desty, destw, desth,
+                         flag);
+
     i965_render_initialize(ctx);
     i965_surface_render_state_setup(ctx, surface,
                             srcx, srcy, srcw, srch,
@@ -1523,6 +1537,8 @@ i965_render_init(VADriverContextP ctx)
     assert(render_state->curbe.bo);
     render_state->curbe.upload = 0;
 
+    i965_post_processing_once_init(ctx);
+
     return True;
 }
 
@@ -1533,6 +1549,8 @@ i965_render_terminate(VADriverContextP ctx)
     struct i965_driver_data *i965 = i965_driver_data(ctx);
     struct i965_render_state *render_state = &i965->render_state;
 
+    i965_post_processing_terminate(ctx);
+
     dri_bo_unreference(render_state->curbe.bo);
     render_state->curbe.bo = NULL;
 
diff --git a/i965_drv_video/i965_render.h b/i965_drv_video/i965_render.h
index 9abb81f..84b50f2 100644
--- a/i965_drv_video/i965_render.h
+++ b/i965_drv_video/i965_render.h
@@ -31,6 +31,8 @@
 #define MAX_RENDER_SURFACES     16
 #define MAX_SAMPLERS            16
 
+#include "i965_post_processing.h"
+
 struct i965_render_state
 {
     struct {
@@ -65,6 +67,9 @@ struct i965_render_state
 
     int interleaved_uv;
     struct intel_region *draw_region;
+
+    int pp_flag; /* 0: disable, 1: enable */
+    struct i965_post_processing_context pp_context;
 };
 
 Bool i965_render_init(VADriverContextP ctx);
@@ -78,7 +83,8 @@ void i965_render_put_surface(VADriverContextP ctx,
                              short destx,
                              short desty,
                              unsigned short destw,
-                             unsigned short desth);
+                             unsigned short desth,
+                             unsigned int flag);
 
 
 void
diff --git a/i965_drv_video/i965_structs.h b/i965_drv_video/i965_structs.h
index d133446..f8be616 100644
--- a/i965_drv_video/i965_structs.h
+++ b/i965_drv_video/i965_structs.h
@@ -639,4 +639,329 @@ struct i965_cc_unit_state
    } cc7;
 };
 
+struct i965_sampler_8x8         /* 16 dwords: dw0-dw14 are 32-bit bit-field groups, dw15 is padding */
+{
+    struct {
+        unsigned int pad0:16;
+        unsigned int chroma_key_index:2;
+        unsigned int chroma_key_enable:1;
+        unsigned int pad1:8;
+        unsigned int ief_filter_size:1;
+        unsigned int ief_filter_type:1;
+        unsigned int ief_bypass:1;
+        unsigned int pad2:1;
+        unsigned int avs_filter_type:1;
+    } dw0;
+
+    struct {
+        unsigned int pad0:5;
+        unsigned int sampler_8x8_state_pointer:27;      /* dw1 is the dword patched by the 8x8 state relocation */
+    } dw1;
+    
+    struct {
+        unsigned int weak_edge_threshold:4;
+        unsigned int strong_edge_threshold:4;
+        unsigned int global_noise_estimation:8;
+        unsigned int pad0:16;
+    } dw2;
+
+    struct {
+        unsigned int r3x_coefficient:5;
+        unsigned int pad0:1;
+        unsigned int r3c_coefficient:5;
+        unsigned int pad1:3;
+        unsigned int gain_factor:6;
+        unsigned int non_edge_weight:3;
+        unsigned int pad2:1;
+        unsigned int regular_weight:3;
+        unsigned int pad3:1;
+        unsigned int strong_edge_weight:3;
+        unsigned int pad4:1;
+    } dw3;
+
+    struct {
+        unsigned int pad0:2;
+        unsigned int mr_boost:1;
+        unsigned int mr_threshold:4;
+        unsigned int steepness_boost:1;
+        unsigned int steepness_threshold:4;
+        unsigned int pad1:2;
+        unsigned int r5x_coefficient:5;
+        unsigned int pad2:1;
+        unsigned int r5cx_coefficient:5;
+        unsigned int pad3:1;
+        unsigned int r5c_coefficient:5;
+        unsigned int pad4:1;
+    } dw4;
+
+    struct {
+        unsigned int pwl1_point_1:8;
+        unsigned int pwl1_point_2:8;
+        unsigned int pwl1_point_3:8;
+        unsigned int pwl1_point_4:8;
+    } dw5;
+
+    struct {
+        unsigned int pwl1_point_5:8;
+        unsigned int pwl1_point_6:8;
+        unsigned int pwl1_r3_bias_0:8;
+        unsigned int pwl1_r3_bias_1:8;
+    } dw6;
+
+    struct {
+        unsigned int pwl1_r3_bias_2:8;
+        unsigned int pwl1_r3_bias_3:8;
+        unsigned int pwl1_r3_bias_4:8;
+        unsigned int pwl1_r3_bias_5:8;
+    } dw7;
+
+    struct {
+        unsigned int pwl1_r3_bias_6:8;
+        unsigned int pwl1_r5_bias_0:8;
+        unsigned int pwl1_r5_bias_1:8;
+        unsigned int pwl1_r5_bias_2:8;
+    } dw8;
+
+    struct {
+        unsigned int pwl1_r5_bias_3:8;
+        unsigned int pwl1_r5_bias_4:8;
+        unsigned int pwl1_r5_bias_5:8;
+        unsigned int pwl1_r5_bias_6:8;
+    } dw9;
+
+    struct {                    /* the slope fields in dw10-dw13 are signed 8-bit values */
+        int pwl1_r3_slope_0:8;
+        int pwl1_r3_slope_1:8;
+        int pwl1_r3_slope_2:8;
+        int pwl1_r3_slope_3:8;
+    } dw10;
+
+    struct {
+        int pwl1_r3_slope_4:8;  /* the AVS setup programs a negative value (-50) here */
+        int pwl1_r3_slope_5:8;
+        int pwl1_r3_slope_6:8;
+        int pwl1_r5_slope_0:8;
+    } dw11;
+
+    struct {
+        int pwl1_r5_slope_1:8;
+        int pwl1_r5_slope_2:8;
+        int pwl1_r5_slope_3:8;
+        int pwl1_r5_slope_4:8;
+    } dw12;
+
+    struct {
+        int pwl1_r5_slope_5:8;
+        int pwl1_r5_slope_6:8;
+        unsigned int limiter_boost:4;
+        unsigned int pad0:4;
+        unsigned int minimum_limiter:4;
+        unsigned int maximum_limiter:4;
+    } dw13;
+
+    struct {
+        unsigned int pad0:8;
+        unsigned int clip_limiter:10;
+        unsigned int pad1:14;
+    } dw14;
+
+    unsigned int dw15; /* Just a pad */
+};
+
+struct i965_sampler_8x8_coefficient     /* 8 dwords of signed 8-bit filter taps; one element of coefficients[] */
+{
+    struct {
+        int table_0x_filter_c0:8;
+        int table_0x_filter_c1:8;
+        int table_0x_filter_c2:8;
+        int table_0x_filter_c3:8;
+    } dw0;                      /* table 0, x direction, taps c0-c3 */
+
+    struct {
+        int table_0x_filter_c4:8;
+        int table_0x_filter_c5:8;
+        int table_0x_filter_c6:8;
+        int table_0x_filter_c7:8;
+    } dw1;                      /* table 0, x direction, taps c4-c7 */
+
+    struct {
+        int table_0y_filter_c0:8;
+        int table_0y_filter_c1:8;
+        int table_0y_filter_c2:8;
+        int table_0y_filter_c3:8;
+    } dw2;                      /* table 0, y direction, taps c0-c3 */
+
+    struct {
+        int table_0y_filter_c4:8;
+        int table_0y_filter_c5:8;
+        int table_0y_filter_c6:8;
+        int table_0y_filter_c7:8;
+    } dw3;                      /* table 0, y direction, taps c4-c7 */
+
+    struct {
+        int pad0:16;
+        int table_1x_filter_c2:8;
+        int table_1x_filter_c3:8;
+    } dw4;                      /* table 1 only has taps c2-c5; the c0/c1 slots are padding */
+
+    struct {
+        int table_1x_filter_c4:8;
+        int table_1x_filter_c5:8;
+        int pad0:16;
+    } dw5;                      /* table 1, x direction, taps c4-c5; the c6/c7 slots are padding */
+
+    struct {
+        int pad0:16;
+        int table_1y_filter_c2:8;
+        int table_1y_filter_c3:8;
+    } dw6;                      /* table 1, y direction, taps c2-c3 */
+
+    struct {
+        int table_1y_filter_c4:8;
+        int table_1y_filter_c5:8;
+        int pad0:16;
+    } dw7;                      /* table 1, y direction, taps c4-c5 */
+};
+
+struct i965_sampler_8x8_state         /* coefficient table followed by two control dwords */
+{
+    struct i965_sampler_8x8_coefficient coefficients[17];   /* 17 entries x 8 dwords each = dw0..dw135 */
+
+    struct {                          /* 3+1+3+1+8+8+8 = 32 bits */
+        unsigned int transition_area_with_8_pixels:3;
+        unsigned int pad0:1;
+        unsigned int transition_area_with_4_pixels:3;
+        unsigned int pad1:1;
+        unsigned int max_derivative_8_pixels:8;
+        unsigned int max_derivative_4_pixels:8;
+        unsigned int default_sharpness_level:8;
+    } dw136;
+
+    struct {                          /* 1+1+19+1+1+9 = 32 bits */
+        unsigned int bit_field_name:1;   /* NOTE(review): placeholder-looking name - confirm the real field name against the hardware spec */
+        unsigned int adaptive_filter_for_all_channel:1;
+        unsigned int pad0:19;
+        unsigned int bypass_y_adaptive_filtering:1;
+        unsigned int bypass_x_adaptive_filtering:1;
+        unsigned int pad1:9;
+    } dw137;
+};
+
+struct i965_surface_state2            /* five dwords: ss0..ss4 */
+{
+    struct {
+        unsigned int surface_base_address;   /* full dword, not a bitfield */
+    } ss0;
+
+    struct {                          /* 2+4+13+13 = 32 bits */
+        unsigned int cbcr_pixel_offset_v_direction:2;
+        unsigned int pad0:4;
+        unsigned int width:13;
+        unsigned int height:13;
+    } ss1;
+
+    struct {                          /* 1+1+1+17+2+4+1+1+4 = 32 bits */
+        unsigned int tile_walk:1;
+        unsigned int tiled_surface:1;
+        unsigned int half_pitch_for_chroma:1;
+        unsigned int pitch:17;
+        unsigned int pad0:2;
+        unsigned int surface_object_control_data:4;
+        unsigned int pad1:1;
+        unsigned int interleave_chroma:1;
+        unsigned int surface_format:4;
+    } ss2;
+
+    struct {                          /* Cb plane offsets: 13+3+13+3 = 32 bits */
+        unsigned int y_offset_for_cb:13;
+        unsigned int pad0:3;
+        unsigned int x_offset_for_cb:13;
+        unsigned int pad1:3;
+    } ss3;
+
+    struct {                          /* Cr plane offsets: same layout as ss3 */
+        unsigned int y_offset_for_cr:13;
+        unsigned int pad0:3;
+        unsigned int x_offset_for_cr:13;
+        unsigned int pad1:3;
+    } ss4;
+};
+
+struct i965_sampler_dndi              /* eight dwords (dw0..dw7), each declaring exactly 32 bits of fields */
+{
+    struct {                          /* 4 x 8 bits */
+        unsigned int denoise_asd_threshold:8;
+        unsigned int denoise_history_delta:8;
+        unsigned int denoise_maximum_history:8;
+        unsigned int denoise_stad_threshold:8;
+    } dw0;
+
+    struct {                          /* 8+5+3+6+2+6+2 = 32 bits */
+        unsigned int denoise_threshold_for_sum_of_complexity_measure:8;
+        unsigned int denoise_moving_pixel_threshold:5;
+        unsigned int stmm_c2:3;
+        unsigned int low_temporal_difference_threshold:6;
+        unsigned int pad0:2;
+        unsigned int temporal_difference_threshold:6;
+        unsigned int pad1:2;
+    } dw1;
+
+    struct {                          /* 4 x 8 bits */
+        unsigned int block_noise_estimate_noise_threshold:8;
+        unsigned int block_noise_estimate_edge_threshold:8; 
+        unsigned int denoise_edge_threshold:8;
+        unsigned int good_neighbor_threshold:8;
+   } dw2;
+
+    struct {                          /* 8+6+2+8+7+1 = 32 bits */
+        unsigned int maximum_stmm:8;
+        unsigned int multipler_for_vecm:6;   /* "multipler" (sic): spelling is part of the interface, do not rename here */
+        unsigned int pad0:2;
+        unsigned int blending_constant_across_time_for_small_values_of_stmm:8;
+        unsigned int blending_constant_across_time_for_large_values_of_stmm:7;
+        unsigned int stmm_blending_constant_select:1;
+    } dw3;
+
+    struct {                          /* 8+8+4+2+2+8 = 32 bits */
+        unsigned int sdi_delta:8;
+        unsigned int sdi_threshold:8;
+        unsigned int stmm_output_shift:4;
+        unsigned int stmm_shift_up:2;
+        unsigned int stmm_shift_down:2;
+        unsigned int minimum_stmm:8;
+    } dw4;
+
+    struct {                          /* 4 x 8 bits */
+        unsigned int fmd_temporal_difference_threshold:8;
+        unsigned int sdi_fallback_mode_2_constant:8;
+        unsigned int sdi_fallback_mode_1_t2_constant:8;
+        unsigned int sdi_fallback_mode_1_t1_constant:8;
+    } dw5;
+
+    struct {                          /* seven 1-bit flags + 1 pad, then 6+2+8+8 = 32 bits */
+        unsigned int dn_enable:1;
+        unsigned int di_enable:1;
+        unsigned int di_partial:1;
+        unsigned int dndi_top_first:1;
+        unsigned int dndi_stream_id:1;
+        unsigned int dndi_first_frame:1;
+        unsigned int progressive_dn:1;
+        unsigned int pad0:1;
+        unsigned int fmd_tear_threshold:6;
+        unsigned int pad1:2;
+        unsigned int fmd2_vertical_difference_threshold:8;
+        unsigned int fmd1_vertical_difference_threshold:8;
+    } dw6;
+
+    struct {                          /* 8+2+6+2+1+4+9 = 32 bits */
+        unsigned int pad0:8;
+        unsigned int fmd_for_1st_field_of_current_frame:2;
+        unsigned int pad1:6;
+        unsigned int fmd_for_2nd_field_of_previous_frame:2;
+        unsigned int vdi_walker_enable:1;
+        unsigned int pad2:4;
+        unsigned int column_width_minus1:9;
+    } dw7;
+};
+
 #endif /* _I965_STRUCTS_H_ */
diff --git a/i965_drv_video/intel_batchbuffer.c b/i965_drv_video/intel_batchbuffer.c
index abe548e..15c3201 100644
--- a/i965_drv_video/intel_batchbuffer.c
+++ b/i965_drv_video/intel_batchbuffer.c
@@ -37,7 +37,7 @@ static void
 intel_batchbuffer_reset(struct intel_batchbuffer *batch)
 {
     struct intel_driver_data *intel = batch->intel; 
-    int batch_size = batch->flag == I915_EXEC_RENDER ? BATCH_SIZE : (BATCH_SIZE * 8);
+    int batch_size = BATCH_SIZE;
 
     assert(batch->flag == I915_EXEC_RENDER ||
            batch->flag == I915_EXEC_BSD);
diff --git a/i965_drv_video/intel_driver.h b/i965_drv_video/intel_driver.h
index ffa8cad..1e2adfa 100644
--- a/i965_drv_video/intel_driver.h
+++ b/i965_drv_video/intel_driver.h
@@ -17,7 +17,7 @@
 #define INLINE
 #endif
 
-#define BATCH_SIZE      0x10000
+#define BATCH_SIZE      0x80000
 #define BATCH_RESERVED  0x10
 
 #define CMD_MI                                  (0x0 << 29)
diff --git a/i965_drv_video/shaders/Makefile.am b/i965_drv_video/shaders/Makefile.am
index 2fd019b..e2b6223 100644
--- a/i965_drv_video/shaders/Makefile.am
+++ b/i965_drv_video/shaders/Makefile.am
@@ -1 +1 @@
-SUBDIRS = h264 mpeg2 render
+SUBDIRS = h264 mpeg2 render post_processing
diff --git a/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.asm b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.asm
new file mode 100644
index 0000000..f6c3a33
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.asm
@@ -0,0 +1,53 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: AYUV_Load_16x8.asm
+//----------------------------------------------------------------
+
+
+#include "AYUV_Load_16x8.inc"
+
+// In order to load 64x8 AYUV data (16x8 pixels), we need to divide the data 
+// into two regions and load them separately. 
+//
+//       32 byte         32 byte
+//|----------------|----------------|
+//|                |                |
+//|       A        |       B        |8
+//|                |                |
+//|                |                |
+//|----------------|----------------|
+
+// Load the first 32x8 data block
+// Packed data block should be loaded as 32x8 pixel block
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w    // Source Block origin
+    shl  (1) rMSGSRC.0<1>:d     acc0:w            2:w          { NoDDClr }      // H. block origin needs to be four times larger (4 bytes per AYUV pixel)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_YUV:ud         { NoDDChk }      // Block width and height (32x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_YUV(0)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud
+
+//Load the second 32x8 data block    
+// Offset the origin X - move to the next 32 byte columns
+    add (1) rMSGSRC.0<1>:d    rMSGSRC.0<0;1,0>:d    32:w                        // Increase X origin by 32 bytes (8 AYUV pixels)
+    
+// Size stays the same - 32x8
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud                               // Copy message description to message header
+    send (8) udSRC_YUV(8)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud
+
+// Give AYUV region addresses to address register
+    mov (1) SRC_YUV_OFFSET<1>:ud 0x00400038*32:ud                               //Address registers contain starting addresses of two halves 
+    
+//Directly move the data to destination
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (16) uwDEST_Y(%1)<1> r[SRC_YUV_OFFSET,%1*32+2]<8,4>:ub              // byte 2 of every 4-byte pixel -> Y
+        mov (16) uwDEST_U(%1)<1> r[SRC_YUV_OFFSET,%1*32+1]<8,4>:ub              // byte 1 of every 4-byte pixel -> U
+        mov (16) uwDEST_V(%1)<1> r[SRC_YUV_OFFSET,%1*32+0]<8,4>:ub              // byte 0 of every 4-byte pixel -> V
+    }        
+    
+\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.inc b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.inc
new file mode 100644
index 0000000..422dfb3
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.inc
@@ -0,0 +1,43 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: AYUV_Load_16x8.inc
+//
+// AYUV data are first loaded to bottom I/O REGION_2, then unpacked to planar data
+// and stored in top I/O REGION_1
+
+#undef 	nY_NUM_OF_ROWS
+
+#define nY_NUM_OF_ROWS      8       // Number of Y rows per block
+
+#define nDPR_BLOCK_SIZE_YUV           nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8   // Y block size 32x8
+#define nDPR_MSG_SIZE_YUV             nRESLEN_8                         // # of MRF's to hold Y block data (8)
+
+//Temporary storage for unpacked AYUV data
+#define     rUNPACK_TEMP     REG(r,nTEMP0)
+.declare    udUNPACK_TEMP    Base=rUNPACK_TEMP    ElementSize=4    SrcRegion=<8;8,1>    Type=ud        //1 GRF
+.declare    ubUNPACK_TEMP    Base=rUNPACK_TEMP    ElementSize=1    SrcRegion=<32;32,1>    Type=ub        //1 GRF
+
+.declare ubBOT_Y_IO     Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(32,1) Type=ub
+
+
+#define udSRC_YUV               udBOT_Y_IO
+#define ubSRC_YUV               ubBOT_Y_IO
+#define nSRC_YUV_REG            nBOT_Y
+
+#define uwDEST_Y                uwTOP_Y
+#define uwDEST_U                uwTOP_U
+#define uwDEST_V                uwTOP_V
+
+#define SRC_YUV_OFFSET a0.0
+
+#define nSRC_REGION nREGION_1    // REGION_1 will be the source region for first kernel
+
+// End of AYUV_Load_16x8.inc
diff --git a/i965_drv_video/shaders/post_processing/Common/Expansion.inc b/i965_drv_video/shaders/post_processing/Common/Expansion.inc
new file mode 100644
index 0000000..7f3d5aa
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/Expansion.inc
@@ -0,0 +1,31 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: Expansion.inc
+// Number of U/V rows per block definition
+#undef 	nUV_NUM_OF_ROWS
+#ifdef EXPAND_9x5
+	#define nUV_NUM_OF_ROWS     6
+#else
+	#define nUV_NUM_OF_ROWS     8
+#endif
+
+// Source/destination region definitions
+#undef uwDEST_U
+#undef uwDEST_V
+#if (nSRC_REGION==nREGION_1)
+	#define uwDEST_U          uwTOP_U
+	#define uwDEST_V          uwTOP_V
+#elif (nSRC_REGION==nREGION_2)
+	#define uwDEST_U          uwBOT_U
+	#define uwDEST_V          uwBOT_V
+#endif
+
+// End of Expansion.inc
diff --git a/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x4.asm b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x4.asm
new file mode 100644
index 0000000..2817175
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x4.asm
@@ -0,0 +1,47 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: IMC3_Load_8x4.asm
+//
+//----------------------------------------------------------------
+
+#define  IMC3_LOAD_8x4
+#include "PL3_Load.inc"
+
+// Load 16x8 planar Y ----------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Block width and height (16x8)
+
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+// Load 8x4 planar U and V -----------------------------------------------------
+    asr (2)  rMSGSRC.0<1>:d     rMSGSRC.0<2;2,1>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x4)
+
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud                                  // Message header for the U read
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud
+    mov  (8) mMSGHDRV<1>:ud     rMSGSRC<8;8,1>:ud                                  // Separate message header for the V read
+    send (8) udSRC_V(0)<1>      mMSGHDRV    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud    // V read sends the header prepared on the previous line
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>         ubSRC_Y(0,%1*16)
+    }
+#endif
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0, %1*16)<1>        ubSRC_U(0, %1*16)
+        mov (16)  uwDEST_V(0, %1*16)<1>        ubSRC_V(0, %1*16)
+    }
+
+// End of IMC3_Load_8x4
diff --git a/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x5.asm b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x5.asm
new file mode 100644
index 0000000..3c96e72
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x5.asm
@@ -0,0 +1,47 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: IMC3_Load_8x5.asm
+//
+//----------------------------------------------------------------
+
+#define  IMC3_LOAD_8x5
+#include "PL3_Load.inc"
+
+// Load 16x8 planar Y ----------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Block width and height (16x8)
+
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+// Load 8x5 planar U and V -----------------------------------------------------
+    asr (2)  rMSGSRC.0<1>:d     rMSGSRC.0<2;2,1>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x5)
+
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud                                  // Message header for the U read
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud
+    mov  (8) mMSGHDRV<1>:ud     rMSGSRC<8;8,1>:ud                                  // Separate message header for the V read
+    send (8) udSRC_V(0)<1>      mMSGHDRV    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud    // V read sends the header prepared on the previous line
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>         ubSRC_Y(0,%1*16)
+    }
+#endif
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0, %1*16)<1>        ubSRC_U(0, %1*16)
+        mov (16)  uwDEST_V(0, %1*16)<1>        ubSRC_V(0, %1*16)
+    }
+
+// End of IMC3_Load_8x5
diff --git a/i965_drv_video/shaders/post_processing/Common/IMC3_Load_9x5.asm b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_9x5.asm
new file mode 100644
index 0000000..d286cbb
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_9x5.asm
@@ -0,0 +1,50 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: IMC3_Load_9x5.asm
+//
+//----------------------------------------------------------------
+//  This module loads 16x8 Y, 9x5 U and 9x5 V planar data blocks for CSC module
+//	and stores them in word-aligned format.
+//----------------------------------------------------------------
+
+#define  IMC3_LOAD_9x5
+#include "PL3_Load.inc"
+
+// Load 16x8 planar Y ----------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Block width and height (16x8)
+
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+// Load 9x5 planar U and V -----------------------------------------------------
+    asr (2)  rMSGSRC.0<1>:d     rMSGSRC.0<2;2,1>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (12x5)
+
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud                                  // Message header for the U read
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud
+    mov  (8) mMSGHDRV<1>:ud     rMSGSRC<8;8,1>:ud                                  // Separate message header for the V read
+    send (8) udSRC_V(0)<1>      mMSGHDRV    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud    // V read sends the header prepared on the previous line
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>         ubSRC_Y(0,%1*16)
+    }
+#endif
+    $for(nUV_NUM_OF_ROWS-2; >-1; -1) {
+        mov (16)  uwDEST_U(0, %1*16)<1>        ubSRC_U(0, %1*16)
+        mov (16)  uwDEST_V(0, %1*16)<1>        ubSRC_V(0, %1*16)
+    }
+
+// End of IMC3_Load_9x5
diff --git a/i965_drv_video/shaders/post_processing/Common/Init_All_Regs.asm b/i965_drv_video/shaders/post_processing/Common/Init_All_Regs.asm
new file mode 100644
index 0000000..cb0fd41
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/Init_All_Regs.asm
@@ -0,0 +1,18 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#ifdef GT	// to remove error messages of un-initialized GRF
+	.declare 	udGRF_space	 	 Base=r0.0 ElementSize=4 SrcRegion=REGION(8,1) Type=ud	
+
+	$for (7; <80; 1) {
+		mov (8) udGRF_space(%1)<1>	0:ud
+	}
+#else
+#endif
+\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Common/Multiple_Loop.asm b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop.asm
new file mode 100644
index 0000000..8a9fd96
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop.asm
@@ -0,0 +1,84 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+/////////////////////////////////////////////////////////////////////////////////
+// Multiple_Loop.asm
+
+
+// This label is for satisfying component kernel build.
+// DL will remove this label and reference the real one in Multiple_Loop_Head.asm.
+#if defined(COMPONENT)
+VIDEO_PROCESSING_LOOP:
+#endif
+
+
+//===== Possible build flags for component kernels
+// 1) INC_SCALING
+// 2) INC_BLENDING
+// 3) INC_BLENDING and INC_SCALING
+// 4) (no flags)
+
+
+#define MxN_MULTIPLE_BLOCKS
+
+//------------------------------------------------------------------------------
+#if defined(MxN_MULTIPLE_BLOCKS)
+// Do Multiple Block Processing ------------------------------------------------
+
+	// The 1st block has been processed before entering the loop
+
+	// Processed all blocks?
+	add.z.f0.0	(1)	wNUM_BLKS:w	wNUM_BLKS:w	-1:w
+
+	// Reached multi-block width?
+	add			(1)	wORIX:w		wORIX:w		16:w
+	cmp.l.f0.1	(1)	null:w		acc0.0:w	wFRAME_ENDX:w	// acc0.0 has wORIX
+
+	#if defined(INC_SCALING)
+	// Update SRC_VID_H_ORI for scaling
+		mul	(1)	REG(r,nTEMP0):f		fVIDEO_STEP_X:f		16.0:f
+		add	(1)	fSRC_VID_H_ORI:f	REG(r,nTEMP0):f		fSRC_VID_H_ORI:f
+	#endif
+
+	#if defined(INC_BLENDING)
+	// Update SRC_ALPHA_H_ORI for blending
+		mul	(1)	REG(r,nTEMP0):f		fALPHA_STEP_X:f		16.0:f
+		add	(1)	fSRC_ALPHA_H_ORI:f	REG(r,nTEMP0):f		fSRC_ALPHA_H_ORI:f
+	#endif
+
+	(f0.0)jmpi	(1)	END_VIDEO_PROCESSING	// All blocks are done - Exit loop
+
+	(f0.1)jmpi	(1)	VIDEO_PROCESSING_LOOP	// If not the end of row, goto the beginning of the loop
+
+	// If end of row, restart the horizontal offset and advance the vertical offsets to the next row.
+	mov	(1)		wORIX:w		wCOPY_ORIX:w
+	add	(1)		wORIY:w		wORIY:w			8:w
+
+	#if defined(INC_SCALING)
+	// Update SRC_VID_H_ORI and SRC_VID_V_ORI for scaling
+		mov	(1)		fSRC_VID_H_ORI:f	fFRAME_VID_ORIX:f	// Restore video X origin from the saved fFRAME_VID_ORIX (not a literal 0)
+		mul	(1)		REG(r,nTEMP0):f		fVIDEO_STEP_Y:f		8.0:f
+		add	(1)		fSRC_VID_V_ORI:f	REG(r,nTEMP0):f		fSRC_VID_V_ORI:f
+	#endif
+
+	#if defined(INC_BLENDING)
+	// Update SRC_ALPHA_H_ORI and SRC_ALPHA_V_ORI for blending
+		mov	(1)		fSRC_ALPHA_H_ORI:f	fFRAME_ALPHA_ORIX:f	// Restore alpha X origin from the saved fFRAME_ALPHA_ORIX (not a literal 0)
+		mul	(1)		REG(r,nTEMP0):f		fALPHA_STEP_Y:f		8.0:f
+		add	(1)		fSRC_ALPHA_V_ORI:f	REG(r,nTEMP0):f		fSRC_ALPHA_V_ORI:f
+	#endif
+
+	jmpi (1)	VIDEO_PROCESSING_LOOP	// Continue Loop
+
+END_VIDEO_PROCESSING:
+	nop
+
+#endif
+END_THREAD	// End of Thread
+\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Common/Multiple_Loop_Head.asm b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop_Head.asm
new file mode 100644
index 0000000..77d8b94
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop_Head.asm
@@ -0,0 +1,23 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//////////////////////////////////////////////////////////////////////////////////
+// Multiple_Loop_Head.asm
+// This code sets up the loop control for multiple blocks per thread
+
+	mul (1)	wFRAME_ENDX:w	ubBLK_CNT_X:ub	16:uw	{ NoDDClr }				// wFRAME_ENDX = X block count * 16 (first half of the loop-end computation)
+	mov (1) wNUM_BLKS:w		ubNUM_BLKS:ub			{ NoDDClr, NoDDChk }	// Copy num blocks to word variable
+	mov (1) wCOPY_ORIX:w	wORIX:w					{ NoDDChk }				// Save the multi-block X origin (in pixels)
+	mov (2) fFRAME_VID_ORIX<1>:f			fSRC_VID_H_ORI<4;2,2>:f			// Copy src video origin for scaling, and alpha origin for blending
+	add (1)	wFRAME_ENDX:w	wFRAME_ENDX:w	wORIX:w							// wFRAME_ENDX += wORIX (second half of the loop-end computation)
+
+VIDEO_PROCESSING_LOOP:		// Loop back entry point as the beginning of the loop for multiple blocks
+	
+// Beginning of the loop
diff --git a/i965_drv_video/shaders/post_processing/Common/NV11_Load_4x8.asm b/i965_drv_video/shaders/post_processing/Common/NV11_Load_4x8.asm
new file mode 100644
index 0000000..54af8d1
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/NV11_Load_4x8.asm
@@ -0,0 +1,42 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: NV11_Load_4x8.asm
+//----------------------------------------------------------------
+
+#define  NV11_LOAD_4x8
+#include "PL2_Load.inc"
+
+// Load 16x8 NV11 Y ------------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+// Load 8x8 NV11 UV ----------------------------------------------------------
+    asr (1)  rMSGSRC.0<1>:d     rMSGSRC.0<0;1,0>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x8)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+    $for (nUV_NUM_OF_ROWS/4-1; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<32;16,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<32;16,2>
+    }
+
+// End of NV11_Load_4x8
diff --git a/i965_drv_video/shaders/post_processing/Common/NV11_Load_5x8.asm b/i965_drv_video/shaders/post_processing/Common/NV11_Load_5x8.asm
new file mode 100644
index 0000000..86a1d35
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/NV11_Load_5x8.asm
@@ -0,0 +1,42 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: NV11_Load_5x8.asm
+//----------------------------------------------------------------
+
+#define  NV11_LOAD_5x8
+#include "PL2_Load.inc"
+
+// Load 16x8 NV11 Y ------------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+// Load 12x8 NV11 UV ---------------------------------------------------------
+    asr (1)  rMSGSRC.0<1>:d     rMSGSRC.0<0;1,0>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (12x8)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<16;8,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<16;8,2>
+    }
+
+// End of NV11_Load_5x8
diff --git a/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x4.asm b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x4.asm
new file mode 100644
index 0000000..dbc47d4
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x4.asm
@@ -0,0 +1,42 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: NV12_Load_8x4.asm
+//----------------------------------------------------------------
+
+#define  NV12_LOAD_8x4
+#include "PL2_Load.inc"
+
+// Load 16x8 planar Y ----------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+// Load 8x4 planar U and V -----------------------------------------------------
+    asr (1)  rMSGSRC.1<1>:d     rMSGSRC.1<0;1,0>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (16x4)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<32;16,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<32;16,2>
+    }
+
+// End of NV12_Load_8x4
diff --git a/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x5.asm b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x5.asm
new file mode 100644
index 0000000..85f5ec7
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x5.asm
@@ -0,0 +1,42 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: NV12_Load_8x5.asm
+//----------------------------------------------------------------
+
+#define  NV12_LOAD_8x5
+#include "PL2_Load.inc"
+
+// Load 16x8 planar Y ----------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+// Load 8x5 planar U and V -----------------------------------------------------
+    asr (1)  rMSGSRC.1<1>:d     rMSGSRC.1<0;1,0>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (16x5)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<16;8,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<16;8,2>
+    }
+
+// End of NV12_Load_8x5
diff --git a/i965_drv_video/shaders/post_processing/Common/NV12_Load_9x5.asm b/i965_drv_video/shaders/post_processing/Common/NV12_Load_9x5.asm
new file mode 100644
index 0000000..b19f0b2
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/NV12_Load_9x5.asm
@@ -0,0 +1,42 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: NV12_Load_9x5.asm
+//----------------------------------------------------------------
+
+#define  NV12_LOAD_9x5
+#include "PL2_Load.inc"
+
+// Load 16x8 planar Y ----------------------------------------------------------
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+// Load 9x5 planar U and V -----------------------------------------------------
+    asr (1)  rMSGSRC.1<1>:d     rMSGSRC.1<0;1,0>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (20x5)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
+    }
+#endif
+	$for(nUV_NUM_OF_ROWS-2; >-1; -1) {
+        mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<16;8,2>
+        mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<16;8,2>
+    }
+
+// End of NV12_Load_9x5
diff --git a/i965_drv_video/shaders/post_processing/Common/P208_Load_8x8.asm b/i965_drv_video/shaders/post_processing/Common/P208_Load_8x8.asm
new file mode 100644
index 0000000..70d07eb
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/P208_Load_8x8.asm
@@ -0,0 +1,41 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: P208_Load_8x8.asm
+//----------------------------------------------------------------
+
+#define  P208_LOAD_8x8
+#include "PL2_Load.inc"
+
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    mov  (1) rMSGSRC.2<1>:ud	nDPR_BLOCK_SIZE_Y:ud							// Y Block width and height (16x8) (U/V block size is the same)
+
+// Load 16x8 P208 Y ------------------------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+	// Load 16x8 planar UV -----------------------------------------------------
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (0; <nY_NUM_OF_ROWS; 1) {
+        mov	(16)	uwDEST_Y(0,%1*16)	ubSRC_Y(0,%1*16)
+    }
+#endif
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov	(16)	uwDEST_U(0,%1*16)	ubSRC_U(0,%1*32)<32;16,2>
+        mov	(16)	uwDEST_V(0,%1*16)	ubSRC_U(0,%1*32+1)<32;16,2>
+    }
+
+// End of P208_Load_8x8.asm
diff --git a/i965_drv_video/shaders/post_processing/Common/P208_Load_9x8.asm b/i965_drv_video/shaders/post_processing/Common/P208_Load_9x8.asm
new file mode 100644
index 0000000..c6ff086
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/P208_Load_9x8.asm
@@ -0,0 +1,42 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: P208_Load_9x8.asm
+//----------------------------------------------------------------
+
+#define  P208_LOAD_9x8
+#include "PL2_Load.inc"
+
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+
+// Load 16x8 P208 Y ------------------------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_Y:ud                               // Y block width and height (16x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud
+#endif
+
+	// Load 20x8 planar UV -----------------------------------------------------
+    mov  (1) rMSGSRC.2<1>:ud	nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (20x8)
+    mov  (8) mMSGHDRU<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
+
+// Convert to word-aligned format ----------------------------------------------
+#if !defined(LOAD_UV_ONLY)
+    $for (0; <nY_NUM_OF_ROWS; 1) {
+        mov	(16)	uwDEST_Y(0,%1*16)	ubSRC_Y(0,%1*16)
+    }
+#endif
+    $for (0; <nUV_NUM_OF_ROWS; 1) {
+        mov	(16)	uwDEST_U(0,%1*16)	ubSRC_U(0,%1*32)<32;16,2>
+        mov	(16)	uwDEST_V(0,%1*16)	ubSRC_U(0,%1*32+1)<32;16,2>
+    }
+
+// End of P208_Load_9x8.asm
diff --git a/i965_drv_video/shaders/post_processing/Common/PA_Load.inc b/i965_drv_video/shaders/post_processing/Common/PA_Load.inc
new file mode 100644
index 0000000..dee657e
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PA_Load.inc
@@ -0,0 +1,42 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PA_Load.inc
+//
+// YUV422 data are first loaded to bottom I/O REGION_2, then unpacked to planar data
+// and stored in top I/O REGION_1
+
+#undef 	nY_NUM_OF_ROWS
+#undef 	nUV_NUM_OF_ROWS
+
+#define nY_NUM_OF_ROWS      8       // Number of Y rows per block
+#define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+
+#if defined(PA_LOAD_8x8)
+        #define nDPR_BLOCK_SIZE_YUV           nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8   // Packed YUV block size 32x8
+        #define nDPR_MSG_SIZE_YUV             nRESLEN_8                         // # of registers to hold the packed YUV block data (8)
+#endif
+#if defined(PA_LOAD_9x8)
+        #define nDPR_BLOCK_SIZE_YUV_MAIN      nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8   // Main packed YUV block size 32x8
+        #define nDPR_MSG_SIZE_YUV_MAIN        nRESLEN_8                         // # of registers to hold the main YUV block data (8)
+        #define nDPR_BLOCK_SIZE_YUV_ADDITION  nBLOCK_WIDTH_4+nBLOCK_HEIGHT_8    // Additional packed YUV block size 4x8
+        #define nDPR_MSG_SIZE_YUV_ADDITION    nRESLEN_1                         // # of registers to hold the additional YUV block data (1)
+#endif
+
+#define udSRC_YUV               udBOT_Y_IO
+#define nSRC_YUV_REG            nBOT_Y
+
+#define uwDEST_Y                uwTOP_Y
+#define uwDEST_U                uwTOP_U
+#define uwDEST_V                uwTOP_V
+
+#define nSRC_REGION nREGION_1    // REGION_1 will be the source region for first kernel
+
+// End of PA_Load.inc
diff --git a/i965_drv_video/shaders/post_processing/Common/PA_Load_8x8.asm b/i965_drv_video/shaders/post_processing/Common/PA_Load_8x8.asm
new file mode 100644
index 0000000..3569bd1
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PA_Load_8x8.asm
@@ -0,0 +1,33 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PA_Load_8x8.asm
+//----------------------------------------------------------------
+
+#define  PA_LOAD_8x8
+#include "PA_Load.inc"
+
+//  Load 16x8 packed data block
+//  Packed data block should be loaded as 32x8 pixel block
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Block origin
+    shl  (1) rMSGSRC.0<1>:d     acc0:w            1:w                              // H. block origin need to be doubled
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_YUV:ud                             // Block width and height (32x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_YUV(0)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud
+
+//  Unpack to "planar" YUV422 format in word-aligned bytes
+    add  (4) pCF_Y_OFFSET<1>:uw    ubSRC_CF_OFFSET<4;4,1>:ub    nSRC_YUV_REG*nGRFWIB:w    // Initial Y,U,V offset in YUV422 block
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (16)  uwDEST_Y(0, %1*16)<1>     r[pCF_Y_OFFSET, %1*nGRFWIB]REGION(16,2)
+        mov (8)   uwDEST_U(0, %1*8)<1>      r[pCF_U_OFFSET, %1*nGRFWIB]REGION(8,4)
+        mov (8)   uwDEST_V(0, %1*8)<1>      r[pCF_V_OFFSET, %1*nGRFWIB]REGION(8,4)
+    }
+
+// End of PA_Load_8x8
diff --git a/i965_drv_video/shaders/post_processing/Common/PA_Load_9x8.asm b/i965_drv_video/shaders/post_processing/Common/PA_Load_9x8.asm
new file mode 100644
index 0000000..90e56e7
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PA_Load_9x8.asm
@@ -0,0 +1,47 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PA_Load_9x8.asm
+//----------------------------------------------------------------
+//  This module loads a packed YUV422 block and unpacks it into 16x8 Y, 9x8 U
+//	and 9x8 V planar data for the CSC module, stored in word-aligned format.
+//----------------------------------------------------------------
+
+#define  PA_LOAD_9x8
+#include "PA_Load.inc"
+
+//  Load 18x8 packed data block
+//  Packed data block should be loaded as 36x8 pixel block
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Block origin
+    shl  (1) rMSGSRC.0<1>:d     acc0:w            1:w                              // H. block origin need to be doubled
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_YUV_MAIN:ud                        // Block width and height (32x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_YUV(0)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV_MAIN+nBI_CURRENT_SRC_YUV:ud
+
+    add  (1) rMSGSRC.0<1>:d     rMSGSRC.0:d       32:w                             //the last 4 pixels are read again for optimization
+    mov  (1) rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_YUV_ADDITION:ud                    // Block width and height (4x8)
+    mov  (8) mMSGHDRY<1>:ud     rMSGSRC<8;8,1>:ud
+    send (8) udSRC_YUV(8)<1>    mMSGHDRY    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV_ADDITION+nBI_CURRENT_SRC_YUV:ud
+
+//  Unpack to "planar" YUV422 format in word-aligned bytes
+    add  (4) pCF_Y_OFFSET<1>:uw    ubSRC_CF_OFFSET<4;4,1>:ub    nSRC_YUV_REG*nGRFWIB:w    // Initial Y,U,V offset in YUV422 block
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (16)  uwDEST_Y(0, %1*16)<1>     r[pCF_Y_OFFSET, %1*nGRFWIB]REGION(16,2)
+        mov (8)   uwDEST_U(0, %1*16)<1>     r[pCF_U_OFFSET, %1*nGRFWIB]REGION(8,4)
+        mov (8)   uwDEST_V(0, %1*16)<1>     r[pCF_V_OFFSET, %1*nGRFWIB]REGION(8,4)
+    }
+
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (1)   uwDEST_U(0, %1*16+8)<1>   r[pCF_U_OFFSET, %1*4+256]REGION(1,0)
+        mov (1)   uwDEST_V(0, %1*16+8)<1>   r[pCF_V_OFFSET, %1*4+256]REGION(1,0)
+    }
+	//UV expansion done in PL9x8_PL16x8.asm module
+
+// End of PA_Load_9x8
diff --git a/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x4.asm b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x4.asm
new file mode 100644
index 0000000..4461c89
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x4.asm
@@ -0,0 +1,38 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL16x8_PL8x4.asm
+//----------------------------------------------------------------
+
+#include "common.inc"
+
+#ifndef DEST_U 	//DEST_U, DEST_V not defined
+
+	#if (nSRC_REGION==nREGION_1)
+		#define DEST_Y		uwTOP_Y
+		#define DEST_U		uwTOP_U
+		#define DEST_V		uwTOP_V
+	#elif (nSRC_REGION==nREGION_2)
+		#define DEST_Y		uwBOT_Y
+		#define DEST_U		uwBOT_U
+		#define DEST_V		uwBOT_V
+	#endif
+	
+#endif
+
+//Convert 444 from sampler to 8x4 U/V (subsampled in both directions, not 422)
+$for (0, 0; <8; 2, 1) {
+	mov	(8)	DEST_U(0,%2*8)<1>	DEST_U(%1)<16;8,2>
+	mov	(8)	DEST_V(0,%2*8)<1>	DEST_V(%1)<16;8,2>	
+}
+
+// Re-define new number of lines
+#undef nUV_NUM_OF_ROWS
+#define nUV_NUM_OF_ROWS     4
diff --git a/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x8.asm b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x8.asm
new file mode 100644
index 0000000..fd592db
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x8.asm
@@ -0,0 +1,36 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL16x8_PL8x8.asm
+//----------------------------------------------------------------
+
+#include "common.inc"
+
+#ifndef DEST_U
+
+	//DEST_U, DEST_V not defined
+	#if (nSRC_REGION==nREGION_1)
+		#define DEST_Y		uwTOP_Y
+		#define DEST_U		uwTOP_U
+		#define DEST_V		uwTOP_V
+	#elif (nSRC_REGION==nREGION_2)
+		#define DEST_Y		uwBOT_Y
+		#define DEST_U		uwBOT_U
+		#define DEST_V		uwBOT_V
+	#endif
+	
+#endif
+
+
+//Convert 444 from sampler to 422
+$for (0, 0; <8; 2, 1) {
+	mov		DEST_U(%2)<1>	DEST_U(%1)<16;8,2>
+	mov		DEST_V(%2)<1>	DEST_V(%1)<16;8,2>	
+}
diff --git a/i965_drv_video/shaders/post_processing/Common/PL2_Load.inc b/i965_drv_video/shaders/post_processing/Common/PL2_Load.inc
new file mode 100644
index 0000000..9feeba6
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL2_Load.inc
@@ -0,0 +1,78 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL2_Load.inc
+
+#undef 	nY_NUM_OF_ROWS
+#undef 	nUV_NUM_OF_ROWS
+
+#define nY_NUM_OF_ROWS      8                                         // Number of Y rows per block
+#define nDPR_BLOCK_SIZE_Y   nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8           // Y block size 16x8
+#define nDPR_MSG_SIZE_Y     nRESLEN_4                                 // # of MRF's to hold Y block data (4)
+
+
+#if defined(NV11_LOAD_4x8)
+        #define nUV_NUM_OF_ROWS     8                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_8    // U/V block size 8x8
+        #define nDPR_MSG_SIZE_UV    nRESLEN_2                         // # of MRF's to hold U/V block data (2)
+#endif
+
+#if defined(NV11_LOAD_5x8)
+        #define nUV_NUM_OF_ROWS     8                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_12+nBLOCK_HEIGHT_8   // U/V block size 12x8
+        #define nDPR_MSG_SIZE_UV    nRESLEN_4                         // # of MRF's to hold U/V block data (4)
+#endif
+#if defined(NV12_LOAD_8x4)
+        #define nUV_NUM_OF_ROWS     4                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // U/V block size 16x4
+        #define nDPR_MSG_SIZE_UV    nRESLEN_2                         // # of MRF's to hold U/V block data (2)
+#endif
+#if defined(NV12_LOAD_8x5)
+        #define nUV_NUM_OF_ROWS     6                                 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_5   // U/V block size 16x5
+        #define nDPR_MSG_SIZE_UV    nRESLEN_3                         // # of MRF's to hold U/V block data (3)
+#endif
+#if defined(NV12_LOAD_9x5)
+        #define nUV_NUM_OF_ROWS     6                                 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_20+nBLOCK_HEIGHT_5   // U/V block size 20x5
+        #define nDPR_MSG_SIZE_UV    nRESLEN_5                         // # of MRF's to hold U/V block data (5)
+#endif
+#if defined(P208_LOAD_8x8)
+        #define nUV_NUM_OF_ROWS     8                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8   // U/V block size 16x8
+        #define nDPR_MSG_SIZE_UV    nRESLEN_4                         // # of MRF's to hold U/V block data (4)
+#endif
+#if defined(P208_LOAD_9x8)
+        #define nUV_NUM_OF_ROWS     8                                 // Number of U/V rows per block
+        #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_20+nBLOCK_HEIGHT_8   // U/V block size 20x8
+        #define nDPR_MSG_SIZE_UV    nRESLEN_8                         // # of MRF's to hold U/V block data (8)
+#endif
+
+// Source/destination region definitions
+#if !defined(udSRC_Y)
+        #define udSRC_Y  udBOT_Y_IO     // Default Y source region is the bottom Y I/O region
+#endif
+
+#if !defined(udSRC_U)
+        #define udSRC_U  udBOT_U_IO     // Default U source region is the bottom U I/O region
+#endif
+
+#define ubSRC_Y           ubBOT_Y       // Loaded data are read back as bytes from the bottom region
+#define nSRC_Y_REG        nBOT_Y
+#define ubSRC_U           ubBOT_U
+#define nSRC_U_REG        nBOT_U
+
+#define uwDEST_Y          uwTOP_Y       // Word-aligned copies of the loaded bytes go to the top region
+#define uwDEST_U          uwTOP_U
+#define uwDEST_V          uwTOP_V
+
+#define nSRC_REGION       nREGION_1     // REGION_1 will be the source region for first kernel
+
+// End of PL2_Load.inc
diff --git a/i965_drv_video/shaders/post_processing/Common/PL3_Load.inc b/i965_drv_video/shaders/post_processing/Common/PL3_Load.inc
new file mode 100644
index 0000000..323df08
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL3_Load.inc
@@ -0,0 +1,59 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL3_Load.inc
+
+#undef 	nY_NUM_OF_ROWS
+#undef 	nUV_NUM_OF_ROWS
+
+#define nY_NUM_OF_ROWS      8                                     // Number of Y rows per block
+#define nDPR_BLOCK_SIZE_Y   nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8       // Y block size 16x8
+#define nDPR_MSG_SIZE_Y     nRESLEN_4                             // # of MRF's to hold Y block data (4)
+
+#if defined(IMC3_LOAD_8x4)
+    #define nUV_NUM_OF_ROWS     4                                 // Number of U/V rows per block
+    #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4    // U/V block size 8x4
+    #define nDPR_MSG_SIZE_UV    nRESLEN_1                         // # of MRF's to hold U/V block data (1)
+#endif
+#if defined(IMC3_LOAD_8x5)
+    #define nUV_NUM_OF_ROWS     6                                 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
+    #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_5    // U/V block size 8x5
+    #define nDPR_MSG_SIZE_UV    nRESLEN_2                         // # of MRF's to hold U/V block data (2)
+#endif
+#if defined(IMC3_LOAD_9x5)
+    #define nUV_NUM_OF_ROWS     6                                 // Number of U/V rows per block (Rounded Up to Nearest Even Number)
+    #define nDPR_BLOCK_SIZE_UV  nBLOCK_WIDTH_12+nBLOCK_HEIGHT_5   // U/V block size 12x5
+    #define nDPR_MSG_SIZE_UV    nRESLEN_3                         // # of MRF's to hold U/V block data (3)
+#endif
+
+// Source/destination region definitions
+#if !defined(udSRC_Y)
+    #define udSRC_Y  udBOT_Y_IO     // Default Y source region is the bottom Y I/O region
+#endif
+
+#if !defined(udSRC_U)
+    #define udSRC_U  udBOT_U_IO     // Default U source region is the bottom U I/O region
+#endif
+
+#if !defined(udSRC_V)
+    #define udSRC_V  udBOT_V_IO     // Default V source region is the bottom V I/O region
+#endif
+
+#define ubSRC_Y      ubBOT_Y        // Loaded data are always of byte type (bottom region)
+#define ubSRC_U      ubBOT_U
+#define ubSRC_V      ubBOT_V
+
+#define uwDEST_Y     uwTOP_Y        // Word-aligned destination for the loaded bytes (top region)
+#define uwDEST_U     uwTOP_U
+#define uwDEST_V     uwTOP_V
+
+#define nSRC_REGION  nREGION_1      // REGION_1 will be the source region for first kernel
+
+// End of PL3_Load.inc
diff --git a/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.asm b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.asm
new file mode 100644
index 0000000..653e634
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.asm
@@ -0,0 +1,86 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+
+#include "PL4x8_Save_NV11.inc"
+
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+
+#if !defined(SAVE_UV_ONLY)
+// Save current planar frame Y block data (16x8) -------------------------------
+
+    mov  (2) mMSGHDR.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov  (1) mMSGHDR.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+
+///* Yoni - masking is not relevant for ILK?!? 
+//
+//        //Use the mask to determine which pixels shouldn't be over-written
+//        cmp.ge.f0.0     (1)             NULLREG         BLOCK_MASK_D:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+//        (f0.0)  jmpi WritePlanarToDataPort
+//
+//        //If mask is not all 1's, then load the entire 16x8 block
+//        //so that only those bytes may be modified that need to be (using the mask)
+//    send (8)    SRC_YD(0)<1>    MSGHDR  MSGSRC<8;8,1>:ud        DWBRMSGDSC+0x00040000+BI_DEST_Y:ud         //16x8
+//        
+//    asr  (2)    MSGSRC.0<1>:ud  ORIX<2;2,1>:w   1:w     // U/V block origin should be half of Y's
+//    mov  (1)    MSGSRC.2<1>:ud  0x00030007:ud           // Block width and height (8x4)
+//    send (8)    SRC_UD(0)<1>    MSGHDR  MSGSRC<8;8,1>:ud        DWBRMSGDSC+0x00010000+BI_DEST_U:ud
+//    send (8)    SRC_VD(0)<1>    MSGHDR  MSGSRC<8;8,1>:ud        DWBRMSGDSC+0x00010000+BI_DEST_V:ud
+//        
+//    //Restore the origin information
+//    mov (2)     MSGSRC.0<1>:ud  ORIX<2;2,1>:w           // Block origin
+//    mov (1)     MSGSRC.2<1>:ud  0x0007000F:ud           // Block width and height (16x8)
+//
+//        //expand U and V to be aligned on word boundary
+//        mov     (16)    SRC_UW(1)<1>            SRC_U(0,16)
+//        mov     (16)    SRC_UW(0)<1>            SRC_U(0, 0)
+//        mov (16)        SRC_VW(1)<1>            SRC_V(0,16)
+//        mov (16)        SRC_VW(0)<1>            SRC_V(0, 0)
+//        
+//        //Merge the data
+//        mov  (1)        f0.1:uw                 BLOCK_MASK_V:uw                 //Load the mask on flag reg
+//        (f0.1)  mov     (8)     TEMP0<1>:uw     BLOCK_MASK_H:uw
+//        (-f0.1) mov     (8)     TEMP0<1>:uw     0:uw
+//                
+//        // Destination is Word aligned
+//                $for(0; <Y_ROW_SIZE; 2) {
+//                        mov     (1)     f0.1:uw         TEMP(0,%1)<0;1,0>
+//                        (-f0.1) mov  (16)       DEST_Y(0, %1*32)<2>             SRC_Y(0, %1*16)
+//                        (-f0.1) mov  (16)       DEST_U(0, %1*8)<1>              SRC_U(0, %1*8)  //only works for Word aligned Byte data
+//                        (-f0.1) mov  (16)       DEST_V(0, %1*8)<1>              SRC_V(0, %1*8)  //only works for Word aligned Byte data
+//
+//                        mov     (1)     f0.1:uw         TEMP(0,1+%1)<0;1,0>
+//                        (-f0.1) mov  (16)       DEST_Y(0, 1+%1*32)<2>   SRC_Y(0, 1+%1*16)
+//
+//                }
+//
+//*/ Yoni - masking is not relevant for ILK?!? 
+        
+WritePlanarToDataPort:
+    $for(0,0; <nY_NUM_OF_ROWS; 2,1) {
+            mov (16) mubMSGPAYLOAD(%2,0)<1>     ub2DEST_Y(%1)REGION(16,2)
+            mov (16) mubMSGPAYLOAD(%2,16)<1>    ub2DEST_Y(%1+1)REGION(16,2)
+    } 
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
+#endif
+
+// Save U/V data block as interleaved U/V bytes (4x8 each) ---------------------
+    mov (2)  rMSGSRC.0<1>:d    wORIX<2;2,1>:w             // Block origin
+    asr (1)  rMSGSRC.0<1>:d    rMSGSRC.0<0;1,0>:d    1:w  // Only element 0 of the origin is halved relative to Y's
+    mov (8)  mMSGHDR<1>:ud     rMSGSRC<8;8,1>:ud          // NOTE(review): rMSGSRC.2 is not written in this file and nDPW_BLOCK_SIZE_UV is unused here - confirm the block size inherited from earlier code is intended
+
+    $for(0,0; <nY_NUM_OF_ROWS;4,1) {
+        mov (16) mubMSGPAYLOAD(%2,0)<2>     ub2DEST_U(%2)REGION(16,2) // U fills the even payload bytes
+        mov (16) mubMSGPAYLOAD(%2,1)<2>     ub2DEST_V(%2)REGION(16,2) // V fills the odd payload bytes
+    }
+    send (8)    dNULLREG    mMSGHDR    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud
+
+// End of PL4x8_Save_NV11
+
diff --git a/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.inc b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.inc
new file mode 100644
index 0000000..ebd134e
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.inc
@@ -0,0 +1,60 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//Module name: PL4x8_Save_NV11.inc
+//
+// Setup for storing planar data
+//
+
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+
+#define nDPW_BLOCK_SIZE_Y   nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8   // Y block size 16x8
+#define nDPW_MSG_SIZE_Y     nMSGLEN_4                         // # of MRF's to hold Y block data (4)
+#define nDPW_BLOCK_SIZE_UV  nBLOCK_WIDTH_8+nBLOCK_HEIGHT_8    // U/V interleaved block width and height (8x8)
+#define nDPW_MSG_SIZE_UV    nMSGLEN_2                         // # of MRF's to hold U/V block data (2)
+
+#if (nSRC_REGION==nREGION_1)
+        #define udSRC_Y                 udBOT_Y_IO
+        #define udSRC_U                 udBOT_U_IO
+        #define udSRC_V                 udBOT_V_IO
+        #define ubSRC_Y                 ubBOT_Y
+        #define ubSRC_U                 ubBOT_U
+        #define ubSRC_V                 ubBOT_V
+
+        #define uwSRC_U                 uwBOT_U  //For masking operation
+        #define uwSRC_V                 uwBOT_V
+
+        #define ub2DEST_Y               ub2TOP_Y
+        #define ub2DEST_U               ub2TOP_U
+        #define ub2DEST_V               ub2TOP_V
+        
+#elif (nSRC_REGION==nREGION_2)
+        #define udSRC_Y                 udTOP_Y_IO
+        #define udSRC_U                 udTOP_U_IO
+        #define udSRC_V                 udTOP_V_IO
+        #define ubSRC_Y                 ubTOP_Y
+        #define ubSRC_U                 ubTOP_U
+        #define ubSRC_V                 ubTOP_V
+
+        #define uwSRC_U                 uwTOP_U  //For masking operation
+        #define uwSRC_V                 uwTOP_V
+
+        #define ub2DEST_Y               ub2BOT_Y
+        #define ub2DEST_U               ub2BOT_U
+        #define ub2DEST_V               ub2BOT_V
+        
+#endif
+
+///* Yoni - masking is not relevant for ILK?!? 
+//#define         TEMP0   REG(r,54)
+//.declare    TEMP        Base=TEMP0      ElementSize=2   SrcRegion=<8;8,1>       Type=uw
+///* Yoni - masking is not relevant for ILK?!? 
+
+
diff --git a/i965_drv_video/shaders/post_processing/Common/PL5x8_PL16x8.asm b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL16x8.asm
new file mode 100644
index 0000000..909f8a7
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL16x8.asm
@@ -0,0 +1,29 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL5x8_PL16x8.asm
+
+#include "Expansion.inc"
+
+//------------------------------ Horizontal Upconversion -----------------------------
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {    // Pass 1 (2x). Counts down to 0; output groups use stride 32 while input groups use stride 16, presumably so unread input is never overwritten -- TODO confirm
+        avg.sat (16) uwDEST_U(0, %1*32+16)    uwDEST_U(0, %1*16+7)<1;2,0>    uwDEST_U(0, %1*16+7)<1;2,1>    // <1;2,0> reads s0,s0,s1,s1,..; <1;2,1> reads s0,s1,s1,s2,.. => s0, avg(s0,s1), s1, avg(s1,s2), ..
+        avg.sat (16) uwDEST_V(0, %1*32+16)    uwDEST_V(0, %1*16+7)<1;2,0>    uwDEST_V(0, %1*16+7)<1;2,1>    // same for V; reading starts at +7, so input element 8 lands at output +18
+        avg.sat (16) uwDEST_U(0, %1*32)    uwDEST_U(0, %1*16)<1;2,0>    uwDEST_U(0, %1*16)<1;2,1>    // lower 16 elements of the same output group
+        avg.sat (16) uwDEST_V(0, %1*32)    uwDEST_V(0, %1*16)<1;2,0>    uwDEST_V(0, %1*16)<1;2,1>
+    }
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {    // Pass 2 (2x again), in place on the 32-element groups written by pass 1
+        avg.sat (16) uwDEST_U(0, %1*32+16) uwDEST_U(0, %1*32+18)<1;2,0> uwDEST_U(0, %1*32+18)<1;2,1>    // starts at +18, where pass 1 placed input element 8
+        avg.sat (16) uwDEST_V(0, %1*32+16) uwDEST_V(0, %1*32+18)<1;2,0> uwDEST_V(0, %1*32+18)<1;2,1>
+        avg.sat (16) uwDEST_U(0, %1*32) uwDEST_U(0, %1*32)<1;2,0> uwDEST_U(0, %1*32)<1;2,1>    // lower half: source and destination start at the same element
+        avg.sat (16) uwDEST_V(0, %1*32) uwDEST_V(0, %1*32)<1;2,0> uwDEST_V(0, %1*32)<1;2,1>
+    }
+
+// End of PL5x8_PL16x8
diff --git a/i965_drv_video/shaders/post_processing/Common/PL5x8_PL8x8.asm b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL8x8.asm
new file mode 100644
index 0000000..068b2ba
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL8x8.asm
@@ -0,0 +1,21 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL5x8_PL8x8.asm
+
+#include "Expansion.inc"
+
+//------------------------------ Horizontal Upconversion -----------------------------
+    $for (0; <nUV_NUM_OF_ROWS; 1) {    // one 8-element row per iteration, expanded in place
+        avg.sat (8) uwDEST_U(0, %1*8)    uwDEST_U(0, %1*8)<1;2,0>    uwDEST_U(0, %1*8)<1;2,1>    // <1;2,0> reads s0,s0,s1,s1,..; <1;2,1> reads s0,s1,s1,s2,.. => s0, avg(s0,s1), s1, .. (uses s0..s4)
+        avg.sat (8) uwDEST_V(0, %1*8)    uwDEST_V(0, %1*8)<1;2,0>    uwDEST_V(0, %1*8)<1;2,1>    // same 2x interpolation for V
+    }
+
+// End of PL5x8_PL8x8
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.asm b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.asm
new file mode 100644
index 0000000..c286992
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.asm
@@ -0,0 +1,88 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL8x4_Save_IMC3.asm
+//
+// Save planar YUV420 frame data block of size 16x8
+
+#include "PL8x4_Save_IMC3.inc"
+
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WritePlanarToDataPort
+
+    //If mask is not all 1's, then load the entire 16x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+
+    // Load 16x8 planar Y ----------------------------------------------------------
+    mov  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov  (1) rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+    send (8) udSRC_Y(0)<1>      mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
+    // Load 8x4 planar U and V -----------------------------------------------------
+    asr  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    1:w   // U/V block origin should be half of Y's
+    mov  (1) rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_UV:ud   // Block width and height (8x4)
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+    send (8) udSRC_U(0)<1>      mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_U:ud
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+    send (8) udSRC_V(0)<1>      mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_V:ud
+
+    //expand U and V to be aligned on word boundary - Y remains in bytes
+    $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        mov (16)  uwSRC_U(0, %1*16)<1>    ubSRC_U(0, %1*16)
+        mov (16)  uwSRC_V(0, %1*16)<1>    ubSRC_V(0, %1*16)
+    }
+
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw
+
+    // Destination is Word aligned; each iteration merges Y rows %1 and %1+1 plus one U/V row
+    $for(0; <nY_NUM_OF_ROWS; 2) {
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>    // per-pixel mask of row %1 into the flag register
+        (-f0.1) mov (16)    ub2DEST_Y(0, %1*32)<2>    ubSRC_Y(0, %1*16)    // where the mask bit is clear, keep the pixel loaded from the surface
+        (-f0.1) mov (16)    ub2DEST_U(0, %1*8)<1>     ubSRC_U(0, %1*8)    //only works for Word aligned Byte data
+        (-f0.1) mov (16)    ub2DEST_V(0, %1*8)<1>     ubSRC_V(0, %1*8)    //only works for Word aligned Byte data
+
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,1+%1)<0;1,0>    // mask of row %1+1
+        (-f0.1) mov (16)    ub2DEST_Y(0, (1+%1)*32)<2>  ubSRC_Y(0, (1+%1)*16)    // row index parenthesized: 1+%1*32 would address byte 1 of row %1
+    }
+
+WritePlanarToDataPort:
+#if !defined(SAVE_UV_ONLY)
+// Save current planar frame Y block data (16x8) -------------------------------
+    mov (2)     rMSGSRC.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov (1)     rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+    mov (8)     mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+    $for(0,0; <nY_NUM_OF_ROWS; 2,1) {
+        mov(16) mubMSGPAYLOAD(%2,0)<1>     ub2DEST_Y(%1)REGION(16,2)
+        mov(16) mubMSGPAYLOAD(%2,16)<1>    ub2DEST_Y(%1+1)REGION(16,2)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
+#endif
+// Save U/V data block in planar format (8x4) ----------------------------------
+    asr  (2)    rMSGSRC.0<1>:d     wORIX<2;2,1>:w    1:w   // U/V block origin should be half of Y's
+    mov  (1)    rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_UV:ud   // Block width and height (8x4)
+    mov  (8)    mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+
+// Save U picture data ---------------------------------------------------------
+    mov (16)    mubMSGPAYLOAD(0,0)<1>      ub2DEST_U(0)REGION(16,2)   // U rows 0,1
+    mov (16)    mubMSGPAYLOAD(0,16)<1>     ub2DEST_U(1)REGION(16,2)   // U rows 2,3
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_U:ud
+    mov  (8)    mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+
+// Save V picture data ---------------------------------------------------------
+    mov  (16)   mubMSGPAYLOAD(0,0)<1>      ub2DEST_V(0)REGION(16,2)   // V rows 0,1
+    mov  (16)   mubMSGPAYLOAD(0,16)<1>     ub2DEST_V(1)REGION(16,2)   // V rows 2,3
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_V:ud
+
+// End of PL8x4_Save_IMC3
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.inc b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.inc
new file mode 100644
index 0000000..3b1df17
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.inc
@@ -0,0 +1,62 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL8x4_Save_IMC3.inc
+//
+// Setup for storing planar data
+//
+
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+
+// For saving
+#define nDPW_BLOCK_SIZE_Y        nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8    // Y block size 16x8
+#define nDPW_MSG_SIZE_Y          nMSGLEN_4                          // # of MRF's to hold Y block data (4)
+#define nDPW_BLOCK_SIZE_UV       nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4     // U/V block size 8x4
+#define nDPW_MSG_SIZE_UV         nMSGLEN_1                          // # of MRF's to hold U/V block data (1)
+
+// For masking (read-back of the existing block before merging)
+#undef  nDPR_MSG_SIZE_Y
+#define nDPR_MSG_SIZE_Y      nRESLEN_4        // # of MRF's to hold Y block data (4)
+#undef  nDPR_MSG_SIZE_UV
+#define nDPR_MSG_SIZE_UV     nRESLEN_1        // # of MRF's to hold U/V block data (1)
+#define     rMASK_TEMP     REG(r,nTEMP0)
+.declare    uwMASK_TEMP    Base=rMASK_TEMP    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+
+#if (nSRC_REGION==nREGION_1)
+    // For saving
+    #define ub2DEST_Y        ub2TOP_Y
+    #define ub2DEST_U        ub2TOP_U
+    #define ub2DEST_V        ub2TOP_V
+    //For masking operation
+    #define udSRC_Y          udBOT_Y_IO
+    #define udSRC_U          udBOT_U_IO
+    #define udSRC_V          udBOT_V_IO
+    #define ubSRC_Y          ubBOT_Y
+    #define ubSRC_U          ubBOT_U
+    #define ubSRC_V          ubBOT_V
+    #define uwSRC_U          uwBOT_U
+    #define uwSRC_V          uwBOT_V
+
+#elif (nSRC_REGION==nREGION_2)
+    // For saving
+    #define ub2DEST_Y        ub2BOT_Y
+    #define ub2DEST_U        ub2BOT_U
+    #define ub2DEST_V        ub2BOT_V
+    //For masking operation
+    #define udSRC_Y          udTOP_Y_IO
+    #define udSRC_U          udTOP_U_IO
+    #define udSRC_V          udTOP_V_IO
+    #define ubSRC_Y          ubTOP_Y
+    #define ubSRC_U          ubTOP_U
+    #define ubSRC_V          ubTOP_V
+    #define uwSRC_U          uwTOP_U
+    #define uwSRC_V          uwTOP_V
+
+#endif
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.asm b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.asm
new file mode 100644
index 0000000..b54a316
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.asm
@@ -0,0 +1,102 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+
+// Module name: PL8x4_Save_NV12.asm
+//
+// Save entire current planar frame data block of size 16x8
+//---------------------------------------------------------------
+//  Symbols needed to be defined before including this module
+//
+//      DWORD_ALIGNED_DEST:     only if DEST_Y, DEST_U, DEST_V data are DWord aligned
+//      ORIX:
+//---------------------------------------------------------------
+
+#include "PL8x4_Save_NV12.inc"
+
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+
+#if !defined(SAVE_UV_ONLY)
+// Save current planar frame Y block data (16x8) -------------------------------
+
+    mov  (2) mMSGHDR.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov  (1) mMSGHDR.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+#endif
+
+//Use the mask to determine which pixels shouldn't be over-written
+	and	(1)		acc0<1>:ud		udBLOCK_MASK<0;1,0>:ud		0x00FFFFFF:ud
+	cmp.ge.f0.0	(1)		dNULLREG		acc0<0;1,0>:ud		0x00FFFFFF:ud	//Check if all pixels in the block need to be modified
+	(f0.0)	jmpi WritePlanarToDataPort
+
+//If mask is not all 1's, then load the entire 16x8 block
+//so that only those bytes may be modified that need to be (using the mask)	
+  send (8)	udSRC_Y(0)<1>	mMSGHDR	udDUMMY_NULL nDATAPORT_READ	nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_DESTINATION_Y:ud		//16x8  
+    
+  asr  (1)	rMSGSRC.1<1>:ud	wORIY<0;1,0>:w	1:w	{ NoDDClr }	// U/V block vertical origin is half of Y's (only the Y coordinate is halved here)
+  mov  (1)	rMSGSRC.2<1>:ud	nDPW_BLOCK_SIZE_UV:ud		{ NoDDChk }	// Block width and height (16x4)
+  mov (8)  mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud //move message descriptor to the message header
+  send (8)	udSRC_U(0)<1>	mMSGHDR	udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_UV:ud     	                                                   
+    	
+//Restore the origin information (Y origin and 16x8 size) after the U/V read
+  mov (2)	rMSGSRC.0<1>:ud	wORIX<2;2,1>:w		// Block origin
+  mov (1)	rMSGSRC.2<1>:ud	nDPW_BLOCK_SIZE_Y:ud		// Block width and height (16x8)
+  mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud //move message descriptor to the message header
+	
+//Merge the data
+	mov  (1)	f0.1:uw			ubBLOCK_MASK_V:ub			//Load the mask on flag reg
+	(f0.1)	mov	(8)	rMASK_TEMP<1>:uw	uwBLOCK_MASK_H:uw
+	(-f0.1)	mov	(8)	rMASK_TEMP<1>:uw	0:uw  
+    
+//convert the mask from 16bits to 8bits by selecting every other bit
+	mov (1) udMASK_TEMP1(0,0)<1> 0x00040001:ud 
+	mov (1) udMASK_TEMP1(0,1)<1> 0x00400010:ud
+	mov (1) udMASK_TEMP1(0,2)<1> 0x04000100:ud
+	mov (1) udMASK_TEMP1(0,3)<1> 0x40001000:ud
+
+//merge the loaded block with the current block
+  $for(0,0; <nY_NUM_OF_ROWS; 2,1) {
+  	mov	(1)	f0.1:uw		uwMASK_TEMP(0, %1)<0;1,0>
+		(-f0.1)	mov  (16)	ubDEST_Y(0,%1*32)<2>		ubSRC_Y(0,%1*16)		
+
+	  and.nz.f0.1 (8) wNULLREG uwMASK_TEMP(0,%1)<0;1,0> uwMASK_TEMP1(0,0) //change the mask by selecting every other bit
+		(-f0.1)	mov  (8)	ubDEST_U(0, %2*16)<2>		ub2SRC_U(0, %1*8)<16;8,2>
+		(-f0.1)	mov  (8)	ubDEST_V(0, %2*16)<2>		ub2SRC_U(0, %1*8+1)<16;8,2>
+		
+		mov	(1)	f0.1:uw		uwMASK_TEMP(0,1+%1)<0;1,0>
+		(-f0.1)	mov  (16)	ubDEST_Y(0, (1+%1)*32)<2>	ubSRC_Y(0, (1+%1)*16)		
+  
+  }	 
+
+WritePlanarToDataPort:
+#if !defined(SAVE_UV_ONLY)
+    $for(0,0; <nY_NUM_OF_ROWS; 2,1) {
+            mov (16) mubMSGPAYLOAD(%2,0)<1>     ub2DEST_Y(%1)REGION(16,2)
+            mov (16) mubMSGPAYLOAD(%2,16)<1>    ub2DEST_Y(%1+1)REGION(16,2)
+    } 
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
+#endif
+    
+//** Save  8x4 packed U and V -----------------------------------------------------
+// we could write directly wORIX to mMSGHDR and then execute asr on it, that way we could
+// avoid using rMSGSRC as a buffer and have one command less in code, but it is unknown whether
+//it is possible to do asr on mMSGHDR so we use rMSGSRC.
+    mov (2)  rMSGSRC.0<1>:d    wORIX<2;2,1>:w             // Block origin
+    asr (1)  rMSGSRC.1<1>:d    rMSGSRC.1<0;1,0>:d    1:w  // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud   nDPW_BLOCK_SIZE_UV:ud      // U/V block width and height (16x4)
+    mov (8)  mMSGHDR<1>:ud     rMSGSRC<8;8,1>:ud
+
+    $for(0,0; <nY_NUM_OF_ROWS;4,1) {
+        mov (16) mubMSGPAYLOAD(%2,0)<2>     ub2DEST_U(%2)REGION(16,2) 
+        mov (16) mubMSGPAYLOAD(%2,1)<2>     ub2DEST_V(%2)REGION(16,2) 
+    }
+    send (8)    dNULLREG    mMSGHDR    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud
+
+// End of PL8x4_Save_NV12  
+
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.inc b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.inc
new file mode 100644
index 0000000..879d7e3
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.inc
@@ -0,0 +1,85 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//Module name: PL8x4_Save_NV12.inc
+//
+// Setup for storing planar data
+//
+
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+#undef nDPW_BLOCK_SIZE_Y
+#undef nDPW_MSG_SIZE_Y
+#undef nDPW_BLOCK_SIZE_UV
+#undef nDPW_MSG_SIZE_UV
+
+#define nDPW_BLOCK_SIZE_Y   nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8   // Y block size 16x8
+#define nDPW_MSG_SIZE_Y     nMSGLEN_4                         // # of MRF's to hold Y block data (4)
+#define nDPW_BLOCK_SIZE_UV  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4    // U/V interleaved block width and height (16x4)
+#define nDPW_MSG_SIZE_UV    nMSGLEN_2                         // # of MRF's to hold U/V block data (2)
+
+// For masking
+#undef  nDPR_MSG_SIZE_Y
+#define nDPR_MSG_SIZE_Y        nRESLEN_4                          // # of MRF's to hold Y block data (4)
+#undef  nDPR_MSG_SIZE_UV
+#define nDPR_MSG_SIZE_UV       nRESLEN_2  
+#define     rMASK_TEMP     REG(r,nTEMP0)
+.declare    uwMASK_TEMP    Base=rMASK_TEMP    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+#define     rMASK_TEMP1     REG(r,nTEMP1)
+.declare    udMASK_TEMP1    Base=rMASK_TEMP1    ElementSize=4    SrcRegion=<4;4,1>    Type=ud        //1 GRF
+.declare    uwMASK_TEMP1    Base=rMASK_TEMP1    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+
+
+#if (nSRC_REGION==nREGION_1)
+        #define udSRC_Y                 udBOT_Y_IO
+        #define udSRC_U                 udBOT_U_IO
+        #define udSRC_V                 udBOT_V_IO
+        #define ubSRC_Y                 ubBOT_Y
+        #define ubSRC_U                 ubBOT_U
+        #define ubSRC_V                 ubBOT_V
+
+        #define uwSRC_U                 uwBOT_U  //For masking operation
+        #define uwSRC_V                 uwBOT_V
+
+        #define ub2DEST_Y               ub2TOP_Y
+        #define ub2DEST_U               ub2TOP_U
+        #define ub2DEST_V               ub2TOP_V
+	
+	      #define ubDEST_Y		            ubTOP_Y	
+      	#define ubDEST_U		            ubTOP_U	
+        #define ubDEST_V		            ubTOP_V
+        
+        #define ub2SRC_U			          ub2BOT_U
+#elif (nSRC_REGION==nREGION_2)
+        #define udSRC_Y                 udTOP_Y_IO
+        #define udSRC_U                 udTOP_U_IO
+        #define udSRC_V                 udTOP_V_IO
+        #define ubSRC_Y                 ubTOP_Y
+        #define ubSRC_U                 ubTOP_U
+        #define ubSRC_V                 ubTOP_V
+
+        #define uwSRC_U                 uwTOP_U  //For masking operation
+        #define uwSRC_V                 uwTOP_V
+
+        #define ub2DEST_Y               ub2BOT_Y
+        #define ub2DEST_U               ub2BOT_U
+        #define ub2DEST_V               ub2BOT_V
+
+	      #define ubDEST_Y		            ubBOT_Y	
+      	#define ubDEST_U		            ubBOT_U	
+        #define ubDEST_V		            ubBOT_V        
+        
+        #define ub2SRC_U			          ub2TOP_U
+#endif
+
+///* Yoni - masking is not relevant for ILK?!? 
+//#define         TEMP0   REG(r,54)
+//.declare    TEMP        Base=TEMP0      ElementSize=2   SrcRegion=<8;8,1>       Type=uw
+///* Yoni - masking is not relevant for ILK?!? 
+
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x5_PL8x8.asm b/i965_drv_video/shaders/post_processing/Common/PL8x5_PL8x8.asm
new file mode 100644
index 0000000..5b98be0
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x5_PL8x8.asm
@@ -0,0 +1,27 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL8x5_PL8x8.asm
+
+#include "Expansion.inc"
+
+//------------------------------- Vertical Upconversion ------------------------------
+    avg.sat (8) uwDEST_U(0, 3*16+8)<1>   uwDEST_U(0, 3*8)    uwDEST_U(0, (1+3)*8)    // Optimization: index 3 done outside the loop with the avg only (no mov of input row 4)
+    avg.sat (8) uwDEST_V(0, 3*16+8)<1>   uwDEST_V(0, 3*8)    uwDEST_V(0, (1+3)*8)    // Optimization: same for V
+
+    $for(nUV_NUM_OF_ROWS/2-2; >-1; -1) {    // counts down to 0; presumably starts at 2 (nUV_NUM_OF_ROWS == 8) -- TODO confirm
+        mov     (8) uwDEST_U(0, (1+%1)*16)<1>    uwDEST_U(0, (1+%1)*8)    // input row %1+1 (stride 8) -> output row 2*(%1+1)
+        avg.sat (8) uwDEST_U(0, %1*16+8)<1>   uwDEST_U(0, %1*8)    uwDEST_U(0, (1+%1)*8)    // output row 2*%1+1 = avg of input rows %1 and %1+1
+
+        mov     (8) uwDEST_V(0, (1+%1)*16)<1>    uwDEST_V(0, (1+%1)*8)    // same row move for V
+        avg.sat (8) uwDEST_V(0, %1*16+8)<1>   uwDEST_V(0, %1*8)    uwDEST_V(0, (1+%1)*8)    // same row average for V
+    }
+
+// End of PL8x5_PL8x8
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.asm b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.asm
new file mode 100644
index 0000000..f21d224
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.asm
@@ -0,0 +1,30 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL8x8_PL8x4.asm
+//
+// Convert PL 8x8 to PL8x4 in GRF
+//---------------------------------------------------------------
+//  Symbols needed to be defined before including this module
+//
+//	DWORD_ALIGNED_DEST:	only if DEST_Y, DEST_U, DEST_V data are DWord aligned
+//	ORIX:
+//---------------------------------------------------------------
+
+#include "PL8x8_PL8x4.inc"
+  
+// Convert PL8x8 to PL8x4 ---------------------------------------------------------
+
+  mov (8) ubDEST_U(0,16)<2> ubDEST_U(1)<16;8,2> //selecting U every other row
+  mov (16) ubDEST_U(0,32)<2> ubDEST_U(2)<32;8,2> //selecting U every other row
+  mov (8) ubDEST_V(0,16)<2> ubDEST_V(1)<16;8,2> //selecting V every other row
+  mov (16) ubDEST_V(0,32)<2> ubDEST_V(2)<32;8,2> //selecting V every other row
+  
+// End of PL8x8_PL8x4.asm -------------------------------------------------------
+\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.inc b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.inc
new file mode 100644
index 0000000..bec884e
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.inc
@@ -0,0 +1,36 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL8x8_PL8x4.inc
+//
+// Setup module for convert PL8x8 to PL8x4 
+//
+// 
+
+// Source/destination region definitions
+//
+#include "undefall.inc"				//Undefine the SRC and DEST symbols
+
+#if (nSRC_REGION==nREGION_1)
+
+	//REGION_1 selected
+	#define ubDEST_Y		ubTOP_Y		
+	#define ubDEST_U		ubTOP_U	
+  #define ubDEST_V		ubTOP_V
+	
+#elif (nSRC_REGION==nREGION_2)
+
+	//REGION_2 selected	
+	#define ubDEST_Y		ubBOT_Y	
+	#define ubDEST_U		ubBOT_U	
+	#define ubDEST_V		ubBOT_V
+  
+	
+#endif
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.asm b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.asm
new file mode 100644
index 0000000..6b3258f
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.asm
@@ -0,0 +1,56 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+
+// Module name: PL8x8_Save_P208.asm
+//
+// Save entire current planar frame data block of size 16x8
+//---------------------------------------------------------------
+//  Symbols needed to be defined before including this module
+//
+//      DWORD_ALIGNED_DEST:     only if DEST_Y, DEST_U, DEST_V data are DWord aligned
+//      ORIX:
+//---------------------------------------------------------------
+
+#include "PL8x8_Save_P208.inc"
+
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+
+#if !defined(SAVE_UV_ONLY)
+// Save current planar frame Y block data (16x8) -------------------------------
+
+    mov  (2) mMSGHDR.0<1>:d     wORIX<2;2,1>:w          // Block origin
+    mov  (1) mMSGHDR.2<1>:ud    nDPW_BLOCK_SIZE_Y:ud    // Block width and height (16x8)
+
+WritePlanarToDataPort:
+    $for(0,0; <nY_NUM_OF_ROWS; 2,1) {
+            mov (16) mubMSGPAYLOAD(%2,0)<1>     ub2DEST_Y(%1)REGION(16,2)
+            mov (16) mubMSGPAYLOAD(%2,16)<1>    ub2DEST_Y(%1+1)REGION(16,2)
+    } 
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud
+#endif
+    
+//** Save  8x8 packed U and V -----------------------------------------------------
+// we could write directly wORIX to mMSGHDR and then execute asr on it, that way we could
+// avoid using rMSGSRC as a buffer and have one command less in code, but it is unknown whether
+//it is possible to do asr on mMSGHDR so we use rMSGSRC.
+    mov (2)  rMSGSRC.0<1>:d    wORIX<2;2,1>:w             // Block origin
+                                                                                                        
+    mov (1)  rMSGSRC.2<1>:ud   nDPW_BLOCK_SIZE_UV:ud      // U/V block width and height (16x8)
+    mov (8)  mMSGHDR<1>:ud     rMSGSRC<8;8,1>:ud          // copy the staged header into the message register
+
+    $for(0,0; <nY_NUM_OF_ROWS;2,1) {
+        mov (16) mubMSGPAYLOAD(%2,0)<2>     ub2DEST_U(%2)REGION(16,2) 
+        mov (16) mubMSGPAYLOAD(%2,1)<2>     ub2DEST_V(%2)REGION(16,2) 
+    }
+    send (8)    dNULLREG    mMSGHDR    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud
+
+//End of PL8x8_Save_P208.asm  
+
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.inc b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.inc
new file mode 100644
index 0000000..e3b7d09
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.inc
@@ -0,0 +1,61 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+
+//Module name: PL8x8_Save_P208.inc
+//
+// Setup for storing planar data
+//
+
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+
+#define nDPW_BLOCK_SIZE_Y   nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8   // Y block size 16x8
+#define nDPW_MSG_SIZE_Y     nMSGLEN_4                         // # of MRF's to hold Y block data (4)
+#define nDPW_BLOCK_SIZE_UV  nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8    // U/V interleaved block width and height (16x8)
+#define nDPW_MSG_SIZE_UV    nMSGLEN_4                         // # of MRF's to hold U/V block data (4)
+
+#if (nSRC_REGION==nREGION_1)
+        #define udSRC_Y                 udBOT_Y_IO
+        #define udSRC_U                 udBOT_U_IO
+        #define udSRC_V                 udBOT_V_IO
+        #define ubSRC_Y                 ubBOT_Y
+        #define ubSRC_U                 ubBOT_U
+        #define ubSRC_V                 ubBOT_V
+
+        #define uwSRC_U                 uwBOT_U  //For masking operation
+        #define uwSRC_V                 uwBOT_V
+
+        #define ub2DEST_Y               ub2TOP_Y
+        #define ub2DEST_U               ub2TOP_U
+        #define ub2DEST_V               ub2TOP_V
+        
+#elif (nSRC_REGION==nREGION_2)
+        #define udSRC_Y                 udTOP_Y_IO
+        #define udSRC_U                 udTOP_U_IO
+        #define udSRC_V                 udTOP_V_IO
+        #define ubSRC_Y                 ubTOP_Y
+        #define ubSRC_U                 ubTOP_U
+        #define ubSRC_V                 ubTOP_V
+
+        #define uwSRC_U                 uwTOP_U  //For masking operation
+        #define uwSRC_V                 uwTOP_V
+
+        #define ub2DEST_Y               ub2BOT_Y
+        #define ub2DEST_U               ub2BOT_U
+        #define ub2DEST_V               ub2BOT_V
+        
+#endif
+
+///* Yoni - masking is not relevant for ILK?!? 
+//#define         TEMP0   REG(r,54)
+//.declare    TEMP        Base=TEMP0      ElementSize=2   SrcRegion=<8;8,1>       Type=uw
+///* Yoni - masking is not relevant for ILK?!? 
+
+
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.asm b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.asm
new file mode 100644
index 0000000..d22c76d
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.asm
@@ -0,0 +1,71 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL8x8_Save_PA.asm
+//
+// Save planar YUV422 to packed YUV422 format data
+//
+// Note: SRC_* must reference regions of data type "BYTE"
+//               so that the data can be saved to byte-aligned byte locations
+
+#include "PL8x8_Save_PA.inc"
+
+    add (4) pCF_Y_OFFSET<1>:uw   ubDEST_CF_OFFSET<4;4,1>:ub   nDEST_YUV_REG*nGRFWIB:w    // Initial Y,U,V offset in YUV422 block
+
+    // Pack Y
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (16) r[pCF_Y_OFFSET, %1*nGRFWIB]<2>    ubSRC_Y(0,%1*32)
+    }
+
+    // Pack U/V
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8)  r[pCF_U_OFFSET, %1*nGRFWIB]<4>    ubSRC_U(0, %1*16)
+        mov (8)  r[pCF_V_OFFSET, %1*nGRFWIB]<4>    ubSRC_V(0, %1*16)
+    }
+
+    shl (1) rMSGSRC.0<1>:d      wORIX<0;1,0>:w            1:w  { NoDDClr }             // H. block origin need to be doubled
+    mov (1) rMSGSRC.1<1>:d      wORIY<0;1,0>:w                 { NoDDClr, NoDDChk }    // Block origin
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_YUV:ud         { NoDDChk }             // Block width and height (32x8)
+
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud
+
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WritePackedToDataPort
+
+    //If mask is not all 1's, then load the entire 32x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+
+    // Load 32x8 packed YUV 422 ----------------------------------------------------
+    send (8) udSRC_YUV(0)<1>    mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_DESTINATION_YUV:ud
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw
+
+    // Destination is Byte aligned
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>
+        (-f0.1) mov (16)    uwDEST_YUV(%1)<1>         uwSRC_YUV(%1)        //check the UV merge - vK
+    }
+
+WritePackedToDataPort:
+    //  Packed YUV data are stored in one of the I/O regions before moving to MRF
+    //  Note: This is necessary since indirect addressing is not supported for MRF. 
+    //  Packed data block should be saved as 32x8 pixel block
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_YUV(%1)REGION(8,1)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_YUV+nBI_DESTINATION_YUV:ud
+
+// End of PL8x8_Save_PA
diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.inc b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.inc
new file mode 100644
index 0000000..a5cb4a3
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.inc
@@ -0,0 +1,52 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL8x8_Save_PA.inc
+//
+// Setup for storing packed data
+//
+
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+
+// For saving
+#define nDPW_BLOCK_SIZE_YUV      nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8    // YUV block size 32x8
+#define nDPW_MSG_SIZE_YUV        nMSGLEN_8                          // # of MRF's to hold YUV block data (8)
+
+// For masking: size of the read-back and a scratch register for the per-row masks
+#undef  nDPR_MSG_SIZE_YUV
+#define nDPR_MSG_SIZE_YUV        nRESLEN_8                          // Response length (8 registers) of the read-back of the YUV block - presumably GRFs, not MRFs
+#define     rMASK_TEMP     REG(r,nTEMP0)
+.declare    uwMASK_TEMP    Base=rMASK_TEMP    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+
+#if (nSRC_REGION==nREGION_1)
+    // For saving (udSRC_YUV receives the block read back for masking, udDEST_YUV is what gets written out)
+    #define udSRC_YUV        udTOP_Y_IO
+    #define udDEST_YUV       udBOT_Y_IO
+    #define nDEST_YUV_REG    nBOT_Y
+    //For masking operation
+    #define ubSRC_Y          ub2TOP_Y
+    #define ubSRC_U          ub2TOP_U
+    #define ubSRC_V          ub2TOP_V
+    #define uwSRC_YUV        uwTOP_Y
+    #define uwDEST_YUV       uwBOT_Y
+
+#elif (nSRC_REGION==nREGION_2)
+    // For saving (same roles as above, with the TOP and BOT regions swapped)
+    #define udSRC_YUV        udBOT_Y_IO
+    #define udDEST_YUV       udTOP_Y_IO
+    #define nDEST_YUV_REG    nTOP_Y
+    //For masking operation
+    #define ubSRC_Y          ub2BOT_Y
+    #define ubSRC_U          ub2BOT_U
+    #define ubSRC_V          ub2BOT_V
+    #define uwSRC_YUV        uwBOT_Y
+    #define uwDEST_YUV       uwTOP_Y
+
+#endif
diff --git a/i965_drv_video/shaders/post_processing/Common/PL9x5_PL16x8.asm b/i965_drv_video/shaders/post_processing/Common/PL9x5_PL16x8.asm
new file mode 100644
index 0000000..697454f
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL9x5_PL16x8.asm
@@ -0,0 +1,37 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL9x5_PL16x8.asm
+// Upsamples the U and V data in place: first horizontally within each row, then vertically between rows
+#define EXPAND_9x5
+#include "Expansion.inc"
+
+//------------------------------ Horizontal Upconversion -----------------------------
+    $for (nUV_NUM_OF_ROWS-2; >-1; -1) {    // Rows are walked from the highest index down to 0
+        avg.sat (16) uwDEST_U(0, %1*16)<1>    uwDEST_U(0, %1*16)<1;2,0>    uwDEST_U(0, %1*16)<1;2,1>    // <1;2,0> repeats each sample, <1;2,1> pairs it with its right neighbour
+        avg.sat (16) uwDEST_V(0, %1*16)<1>    uwDEST_V(0, %1*16)<1;2,0>    uwDEST_V(0, %1*16)<1;2,1>    // Same interpolation for V
+    }
+
+#undef 	nUV_NUM_OF_ROWS
+#define nUV_NUM_OF_ROWS		8	//use packed version of all post-processing kernels
+
+//------------------------------- Vertical Upconversion ------------------------------
+    avg.sat (16) uwDEST_U(0, 3*32+16)<1>   uwDEST_U(0, 3*16)    uwDEST_U(0, (1+3)*16)    // Last output row (offset 7*16) = average of source rows 3 and 4
+    avg.sat (16) uwDEST_V(0, 3*32+16)<1>   uwDEST_V(0, 3*16)    uwDEST_V(0, (1+3)*16)    // Same for V
+
+    $for(nUV_NUM_OF_ROWS/2-2; >-1; -1) {    // Counts down so that each source row is read before its slot is overwritten
+        mov     (16) uwDEST_U(0, (1+%1)*32)<1>    uwDEST_U(0, (1+%1)*16)    // Source row n+1 moves to output row 2*(n+1)
+        avg.sat (16) uwDEST_U(0, %1*32+16)<1>   uwDEST_U(0, %1*16)    uwDEST_U(0, (1+%1)*16)    // Output row 2n+1 = average of source rows n and n+1
+
+        mov     (16) uwDEST_V(0, (1+%1)*32)<1>    uwDEST_V(0, (1+%1)*16)    // Same two steps for V
+        avg.sat (16) uwDEST_V(0, %1*32+16)<1>   uwDEST_V(0, %1*16)    uwDEST_V(0, (1+%1)*16)
+    }
+
+// End of PL9x5_PL16x8
diff --git a/i965_drv_video/shaders/post_processing/Common/PL9x8_PL16x8.asm b/i965_drv_video/shaders/post_processing/Common/PL9x8_PL16x8.asm
new file mode 100644
index 0000000..b0fa549
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/PL9x8_PL16x8.asm
@@ -0,0 +1,21 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: PL9x8_PL16x8.asm
+
+#include "Expansion.inc"
+
+//------------------------------ Horizontal Upconversion -----------------------------
+    $for (0; <nUV_NUM_OF_ROWS; 1) {    // Each row is expanded in place; no vertical pass in this module
+        avg.sat (16) uwDEST_U(0, %1*16)<1>    uwDEST_U(0, %1*16)<1;2,0>    uwDEST_U(0, %1*16)<1;2,1>    // <1;2,0> repeats each sample, <1;2,1> pairs it with its right neighbour
+        avg.sat (16) uwDEST_V(0, %1*16)<1>    uwDEST_V(0, %1*16)<1;2,0>    uwDEST_V(0, %1*16)<1;2,1>    // Same interpolation for V
+    }
+
+// End of PL9x8_PL16x8
+\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.asm b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.asm
new file mode 100644
index 0000000..7903d63
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.asm
@@ -0,0 +1,88 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: RGB16x8_Save_RGB.asm
+//
+// Save packed ARGB 444 frame data block of size 16x8
+//
+// To save 16x8 block (64x8 byte layout for ARGB8888) we need 2 send instructions
+//  ---------
+//  | 1 | 2 |
+//  --------- 
+
+#include "RGB16x8_Save_RGB.inc"
+
+    shl (1) rMSGSRC.0<1>:d      wORIX<0;1,0>:w            2:w  { NoDDClr }             // H. block origin needs to be quadrupled (4 bytes per ARGB8888 pixel)
+    mov (1) rMSGSRC.1<1>:d      wORIY<0;1,0>:w                 { NoDDClr, NoDDChk }    // Block origin (1st quadrant)
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_ARGB:ud        { NoDDChk }             // Block width and height (32x8) of each of the 2 sends
+
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud                                      // Copy the prepared header into the message register
+
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud   //Keep only the low 24 bits of the block mask
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WriteARGBToDataPort                                     //Full mask: nothing to preserve, go straight to the write
+
+    //If mask is not all 1's, then load the entire 64x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+
+    // Load first 32x8-byte half of the 16x8 packed ARGB 444 block -----------------
+    or (1)         acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0xFF00FF00:ud   //Check first block: force bits 8-15 and 24-31 so only bits 0-7 and 16-23 are tested
+    cmp.e.f0.0 (1) dNULLREG     acc0.0<0;1,0>:ud         0xFFFFFFFF:ud   
+    (f0.0)  jmpi SkipFirstBlockMerge                                     //If full mask then skip this block
+
+    send (8) udSRC_ARGB(0)<1>   mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud                        //Reload the message header from rMSGSRC
+
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    //Rows enabled in the vertical mask get the horizontal mask (TODO vK: use sel instruction)
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw                 //Rows not enabled get an all-zero mask
+
+    $for(0, 0; <nY_NUM_OF_ROWS; 1, 2) {               //take care of the lines in the block, they are different in the src and dest
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>    //Load the mask of the current row
+        (-f0.1) mov (8)     udDEST_ARGB(%2)<1>        udSRC_ARGB(%1)              //Channels whose mask bit is clear take the pixel that was read back
+    }
+
+SkipFirstBlockMerge:
+    // Load second 32x8-byte half of the 16x8 packed ARGB 444 block ----------------
+    or (1)         acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0xFF0000FF:ud   //Check second block: force bits 0-7 and 24-31 so only bits 8-23 are tested
+    cmp.e.f0.0 (1) dNULLREG     acc0.0<0;1,0>:ud         0xFFFFFFFF:ud   
+    (f0.0)  jmpi WriteARGBToDataPort                                     //If full mask then skip this block
+
+    add  (1) mMSGHDR.0<1>:d     rMSGSRC.0<0;1,0>:d       32:d     // Point to 2nd part
+    send (8) udSRC_ARGB(0)<1>   mMSGHDR    udDUMMY_NULL  nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud                 // Point to 1st part again
+
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  shr (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    8:uw    //load the mask for second block
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw                 //Rows not enabled in the vertical mask get an all-zero mask
+
+    $for(0, 1; <nY_NUM_OF_ROWS; 1, 2) {               //take care of the lines in the block, they are different in the src and dest
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>    //Load the mask of the current row
+        (-f0.1) mov (8)     udDEST_ARGB(%2)<1>        udSRC_ARGB(%1)              //Channels whose mask bit is clear take the pixel that was read back
+    }
+
+WriteARGBToDataPort:
+    // Move packed data to MRF and output (each row spans 2 registers; one send per half)
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*2)      // 1st register of each row
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+
+    mov  (8)    mMSGHDR<1>:ud         rMSGSRC<8;8,1>:ud
+    add  (1)    mMSGHDR.0<1>:d        rMSGSRC.0<0;1,0>:d       32:d   // Point to 2nd part
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*2+1)    // 2nd register of each row
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+
+// End of RGB16x8_Save_RGB
diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.inc b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.inc
new file mode 100644
index 0000000..3dee653
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.inc
@@ -0,0 +1,38 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: RGB16x8_Save_RGB.inc
+//
+
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+
+// For saving
+#define nDPW_BLOCK_SIZE_ARGB     nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8    // ARGB block size 32x8
+#define nDPW_MSG_SIZE_ARGB       nMSGLEN_8                          // # of MRF's to hold ARGB block data (8)
+
+// For masking: size of the read-back and a scratch register for the per-row masks
+#undef  nDPR_MSG_SIZE_ARGB
+#define nDPR_MSG_SIZE_ARGB       nRESLEN_8                          // Response length (8 registers) of the read-back of the ARGB block - presumably GRFs, not MRFs
+#define     rMASK_TEMP     REG(r,nTEMP0)
+.declare    uwMASK_TEMP    Base=rMASK_TEMP    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+
+#if (nSRC_REGION==nREGION_1)
+    // For saving
+    #define udDEST_ARGB      udTOP_Y_IO    //The output of previous stage is stored here; This is modified and is written to render cache
+    //For masking operation
+    #define udSRC_ARGB       udBOT_Y_IO    //To hold the destination data that shouldn't be modified
+
+#elif (nSRC_REGION==nREGION_2)
+    // For saving
+    #define udDEST_ARGB      udBOT_Y_IO    //The output of previous stage is stored here; This is modified and is written to render cache
+    //For masking operation
+    #define udSRC_ARGB       udTOP_Y_IO    //To hold the destination data that shouldn't be modified
+
+#endif
diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.asm b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.asm
new file mode 100644
index 0000000..3fbb9eb
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.asm
@@ -0,0 +1,72 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: RGB16x8_Save_RGB16.asm
+//
+// Save packed RGB565 frame data block of size 16x8
+//
+// To save 16x8 block (32x8 byte layout for RGB565) we need 1 send instruction
+//  -----
+//  | 1 |
+//  ----- 
+
+#include "RGB16x8_Save_RGB16.inc"
+
+//convert 32 bit RGB to 16 bit RGB
+    // Truncate A8R8G8B8 to R5G6B5 (alpha is not carried over).
+    // That is keeping 5 MSB of R and B, and 6 MSB of G.
+
+    $for (0, 0; <nY_NUM_OF_ROWS; 1, 2) {
+        shr     uwCSC_TEMP(%1,0)<1>    ubDEST_ARGB(%2,0)<32;8,4>   3:w                // B >> 3
+
+        shl (16) uwTEMP_RGB16(0)<1>    uwDEST_ARGB(%2,1)<16;8,2>   8:w                // R << 8
+        and (16) uwTEMP_RGB16(0)<1>    uwTEMP_RGB16(0)             0xF800:uw          // Keep the 5 MSB of R in bits 15:11
+        or  (16) uwCSC_TEMP(%1,0)<1>   uwCSC_TEMP(%1,0)<16;16,1>   uwTEMP_RGB16(0)    // Merge R into the result
+
+        shr (16) uwTEMP_RGB16(0)<1>    uwDEST_ARGB(%2,0)<16;8,2>   5:w                // G >> 5
+        and (16) uwTEMP_RGB16(0)<1>    uwTEMP_RGB16(0)             0x07E0:uw          // Keep the 6 MSB of G in bits 10:5
+        or  (16) uwCSC_TEMP(%1,0)<1>   uwCSC_TEMP(%1,0)<16;16,1>   uwTEMP_RGB16(0)    // Merge G into the result
+    }
+
+    mov (2) rMSGSRC.0<1>:d      wORIX<2;2,1>:w                      // Block origin (1st quadrant)
+    shl (1) rMSGSRC.0<1>:d      wORIX<0;1,0>:w              1:w     // H. block origin needs to be doubled for byte offset (2 bytes per RGB565 pixel)
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_RGB16:ud            // Block width and height (32x8)
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud                   // Copy the prepared header into the message register
+
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud   //Keep only the low 24 bits of the block mask
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WriteRGB16ToDataPort                                    //Full mask: nothing to preserve, go straight to the write
+
+    //If mask is not all 1's, then load the entire 32x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+
+    // Load 32x8 packed RGB565 -----------------------------------------------------
+    send (8) udSRC_RGB16(0)<1>  mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_RGB16+nBI_DESTINATION_RGB:ud
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud                        //Reload the message header from rMSGSRC
+
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    //Rows enabled in the vertical mask get the horizontal mask (TODO vK: use sel instruction)
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw                 //Rows not enabled get an all-zero mask
+
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>    //Load the mask of the current row
+        (-f0.1) mov (16)    uwCSC_TEMP(%1)<1>         uwSRC_RGB16(%1)             //Channels whose mask bit is clear take the pixel that was read back
+    }
+
+WriteRGB16ToDataPort:
+    // Move packed data to MRF and output
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udCSC_TEMP(%1)    // One register of converted 16-bit pixels per row
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_RGB16+nBI_DESTINATION_RGB:ud
+
+// End of RGB16x8_Save_RGB16
diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.inc b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.inc
new file mode 100644
index 0000000..8161432
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.inc
@@ -0,0 +1,49 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: RGB16x8_Save_RGB16.inc
+//
+
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+
+// For saving
+#define nDPW_BLOCK_SIZE_RGB16    nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8    // RGB16 block size 32x8
+#define nDPW_MSG_SIZE_RGB16      nMSGLEN_8                          // # of MRF's to hold RGB16 block data (8)
+
+// For conversion to 16bit
+.declare	uwTEMP_RGB16    Base=REG(r,nTEMP1)	ElementSize=2 SrcRegion=<16;16,1>	Type=uw		//1 GRF
+
+// For masking: size of the read-back and a scratch register for the per-row masks
+#undef  nDPR_MSG_SIZE_RGB16
+#define nDPR_MSG_SIZE_RGB16      nRESLEN_8                          // Response length (8 registers) of the read-back of the RGB16 block - presumably GRFs, not MRFs
+#define     rMASK_TEMP     REG(r,nTEMP0)
+.declare    uwMASK_TEMP    Base=rMASK_TEMP    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+
+#if (nSRC_REGION==nREGION_1)
+    // For saving
+    #define ubDEST_ARGB      ubTOP_Y       //Data from previous module
+    #define uwDEST_ARGB      uwTOP_Y       //Data from previous module
+    #define udCSC_TEMP       udBOT_Y_IO    //Data Converted to 16 bits
+    #define uwCSC_TEMP       uwBOT_Y
+    //For masking operation (the read-back reuses the region that held the 32-bit input)
+    #define udSRC_RGB16      udTOP_Y_IO    //To hold the destination data that shouldn't be modified
+    #define uwSRC_RGB16      uwTOP_Y
+
+#elif (nSRC_REGION==nREGION_2)
+    // For saving
+    #define ubDEST_ARGB      ubBOT_Y       //Data from previous module
+    #define uwDEST_ARGB      uwBOT_Y       //Data from previous module
+    #define udCSC_TEMP       udTOP_Y_IO    //Data Converted to 16 bits
+    #define uwCSC_TEMP       uwTOP_Y
+    //For masking operation (the read-back reuses the region that held the 32-bit input)
+    #define udSRC_RGB16      udBOT_Y_IO    //To hold the destination data that shouldn't be modified
+    #define uwSRC_RGB16      uwBOT_Y
+
+#endif
diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.asm b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.asm
new file mode 100644
index 0000000..915f797
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.asm
@@ -0,0 +1,107 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: RGB16x8_Save_Y416.asm
+//
+// Save packed ARGB 444 frame data block of size 16x8
+//
+// To save 16x8 block (128x8 byte layout for ARGB 16bit per component) we need 4 send instructions
+//  -----------------
+//  | 1 | 2 | 3 | 4 |
+//  ----------------- 
+
+#include "RGB16x8_Save_RGB.inc"    // NOTE(review): RGB16x8_Save_Y416.inc carries the same definitions - confirm which include is intended
+
+    shl (1) rMSGSRC.0<1>:d      wORIX<0;1,0>:w            3:w  { NoDDClr }             // H. block origin needs to be multiplied by 8 (8 bytes per pixel)
+    mov (1) rMSGSRC.1<1>:d      wORIY<0;1,0>:w                 { NoDDClr, NoDDChk }    // Block origin (1st quadrant)
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_ARGB:ud        { NoDDChk }             // Block width and height (32x8) of each of the 4 sends
+
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud                                      // Copy the prepared header into the message register
+/*	Not needed for validation kernels for now -vK
+//Use the mask to determine which pixels shouldn't be over-written
+    and (1)        acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0x00FFFFFF:ud
+    cmp.ge.f0.0(1) dNULLREG     acc0.0<0;1,0>:ud         0x00FFFFFF:ud   //Check if all pixels in the block need to be modified
+    (f0.0)  jmpi WriteARGBToDataPort
+
+    //If mask is not all 1's, then load the entire 64x8 block
+    //so that only those bytes may be modified that need to be (using the mask)
+
+    // Load first block 16x8 packed ARGB 444 ---------------------------------------
+    or (1)         acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0xFF00FF00:ud   //Check first block
+    cmp.e.f0.0 (1) dNULLREG     acc0.0<0;1,0>:ud         0xFFFFFFFF:ud   
+    (f0.0)  jmpi SkipFirstBlockMerge                                     //If full mask then skip this block
+
+    send (8) udSRC_ARGB(0)<1>   mMSGHDR     udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud
+
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  mov (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    //use sel instruction - vK
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw
+
+    $for(0, 0; <nY_NUM_OF_ROWS; 1, 2) {               //take care of the lines in the block, they are different in the src and dest
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>
+        (-f0.1) mov (8)     udDEST_ARGB(%2)<1>        udSRC_ARGB(%1) 
+    }
+
+SkipFirstBlockMerge:
+    // Load second block 16x8 packed ARGB 444 ---------------------------------------
+    or (1)         acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud   0xFF0000FF:ud   //Check second block
+    cmp.e.f0.0 (1) dNULLREG     acc0.0<0;1,0>:ud         0xFFFFFFFF:ud   
+    (f0.0)  jmpi WriteARGBToDataPort                                     //If full mask then skip this block
+
+    add  (1) mMSGHDR.0<1>:d     rMSGSRC.0<0;1,0>:d       32:d     // Point to 2nd part
+    send (8) udSRC_ARGB(0)<1>   mMSGHDR    udDUMMY_NULL  nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+    mov  (8) mMSGHDR<1>:ud      rMSGSRC<8;8,1>:ud                 // Point to 1st part again
+
+    //Merge the data
+    mov (1)           f0.0:uw             ubBLOCK_MASK_V:ub    //Load the mask on flag reg
+    (f0.0)  shr (8)   rMASK_TEMP<1>:uw    uwBLOCK_MASK_H:uw    8:uw    //load the mask for second block
+    (-f0.0) mov (8)   rMASK_TEMP<1>:uw    0:uw
+
+    $for(0, 1; <nY_NUM_OF_ROWS; 1, 2) {               //take care of the lines in the block, they are different in the src and dest
+        mov (1)             f0.1:uw                   uwMASK_TEMP(0,%1)<0;1,0>
+        (-f0.1) mov (8)     udDEST_ARGB(%2)<1>        udSRC_ARGB(%1) 
+    }
+*/
+WriteARGBToDataPort:
+    // Move packed data to MRF and output (each row spans 4 registers; one send per 32-byte column)
+    
+    //Write 1st 4X8 pixels  
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*4)      // 1st register of each row
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+
+	//Write 2nd 4X8 pixels  
+    mov  (8)    mMSGHDR<1>:ud         rMSGSRC<8;8,1>:ud
+    add  (1)    mMSGHDR.0<1>:d        rMSGSRC.0<0;1,0>:d       32:d   // Point to 2nd part
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*4+1)    // 2nd register of each row
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+
+	//Write 3rd 4X8 pixels  
+    mov  (8)    mMSGHDR<1>:ud         rMSGSRC<8;8,1>:ud
+    add  (1)    mMSGHDR.0<1>:d        rMSGSRC.0<0;1,0>:d       64:d   // Point to 3rd part
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*4+2)    // 3rd register of each row
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+
+	//Write 4th 4X8 pixels  
+    mov  (8)    mMSGHDR<1>:ud         rMSGSRC<8;8,1>:ud
+    add  (1)    mMSGHDR.0<1>:d        rMSGSRC.0<0;1,0>:d       96:d   // Point to 4th part
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+        mov (8) mudMSGPAYLOAD(%1)<1>       udDEST_ARGB(%1*4+3)    // 4th register of each row
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud
+
+// End of RGB16x8_Save_Y416
diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.inc b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.inc
new file mode 100644
index 0000000..b6b45c4
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.inc
@@ -0,0 +1,38 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: RGB16x8_Save_Y416.inc
+//
+
+#include "undefall.inc"                 //Undefine the SRC and DEST symbols
+
+// For saving
+#define nDPW_BLOCK_SIZE_ARGB     nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8    // ARGB block size 32x8
+#define nDPW_MSG_SIZE_ARGB       nMSGLEN_8                          // # of MRF's to hold ARGB block data (8)
+
+// For masking: size of the read-back and a scratch register for the per-row masks
+#undef  nDPR_MSG_SIZE_ARGB
+#define nDPR_MSG_SIZE_ARGB       nRESLEN_8                          // Response length (8 registers) of the read-back of the ARGB block - presumably GRFs, not MRFs
+#define     rMASK_TEMP     REG(r,nTEMP0)
+.declare    uwMASK_TEMP    Base=rMASK_TEMP    ElementSize=2    SrcRegion=<8;8,1>    Type=uw        //1 GRF
+
+#if (nSRC_REGION==nREGION_1)
+    // For saving
+    #define udDEST_ARGB      udTOP_Y_IO    //The output of previous stage is stored here; This is modified and is written to render cache
+    //For masking operation
+    #define udSRC_ARGB       udBOT_Y_IO    //To hold the destination data that shouldn't be modified
+
+#elif (nSRC_REGION==nREGION_2)
+    // For saving
+    #define udDEST_ARGB      udBOT_Y_IO    //The output of previous stage is stored here; This is modified and is written to render cache
+    //For masking operation
+    #define udSRC_ARGB       udTOP_Y_IO    //To hold the destination data that shouldn't be modified
+
+#endif
diff --git a/i965_drv_video/shaders/post_processing/Common/RGB_Pack.asm b/i965_drv_video/shaders/post_processing/Common/RGB_Pack.asm
new file mode 100644
index 0000000..063e256
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/RGB_Pack.asm
@@ -0,0 +1,40 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+
+.declare SRC_B		Base=REG(r,10)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare SRC_G		Base=REG(r,18)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare SRC_R		Base=REG(r,26)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare SRC_A		Base=REG(r,34)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+
+#define DEST_ARGB		ubBOT_ARGB
+// The packed result goes to the BOT region; presumably region 2 is selected so the next stage reads it from there
+#undef 	nSRC_REGION
+#define nSRC_REGION		nREGION_2
+
+
+//Interleave the planar B, G, R, A words into packed 4-byte pixels (TODO vK: pack directly to mrf as optimization)
+
+$for(0, 0; <8; 1, 2) {
+//	mov	(16) 	DEST_ARGB(%2,0)<4>		SRC_B(%1) 					{ Compr, NoDDClr }			// 16 B
+//	mov	(16) 	DEST_ARGB(%2,1)<4>		SRC_G(%1)					{ Compr, NoDDClr, NoDDChk }	// 16 G
+//	mov	(16) 	DEST_ARGB(%2,2)<4>		SRC_R(%1)					{ Compr, NoDDClr, NoDDChk }	// 16 R	//these 2 inst can be merged - vK
+//	mov	(16) 	DEST_ARGB(%2,3)<4>		SRC_A(%1)					{ Compr, NoDDChk }			//DEST_RGB_FORMAT<0;1,0>:ub	{ Compr, NoDDChk }			// 16 A
+
+	mov	(8) 	DEST_ARGB(%2,  0)<4>		SRC_B(%1) 					{ NoDDClr }				// 8 B (first 8 pixels of the row, byte 0 of each pixel)
+	mov	(8) 	DEST_ARGB(%2,  1)<4>		SRC_G(%1)					{ NoDDClr, NoDDChk }	// 8 G
+	mov	(8) 	DEST_ARGB(%2,  2)<4>		SRC_R(%1)					{ NoDDClr, NoDDChk }	// 8 R
+	mov	(8) 	DEST_ARGB(%2,  3)<4>		SRC_A(%1)					{ NoDDChk }				// 8 A
+
+	mov	(8) 	DEST_ARGB(%2+1,0)<4>		SRC_B(%1,8) 				{ NoDDClr }				// 8 B (last 8 pixels of the row, next destination register)
+	mov	(8) 	DEST_ARGB(%2+1,1)<4>		SRC_G(%1,8)					{ NoDDClr, NoDDChk }	// 8 G
+	mov	(8) 	DEST_ARGB(%2+1,2)<4>		SRC_R(%1,8)					{ NoDDClr, NoDDChk }	// 8 R
+	mov	(8) 	DEST_ARGB(%2+1,3)<4>		SRC_A(%1,8)					{ NoDDChk }				// 8 A
+}
diff --git a/i965_drv_video/shaders/post_processing/Common/SetupVPKernel.asm b/i965_drv_video/shaders/post_processing/Common/SetupVPKernel.asm
new file mode 100644
index 0000000..6375b0c
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/SetupVPKernel.asm
@@ -0,0 +1,34 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: SetupVPKernel.asm
+//
+// Initial setup for running video-processing kernels
+//
+
+#include "common.inc"
+
+//
+//  Now, begin source code....
+//
+.code
+
+#include "Init_All_Regs.asm"
+
+mov (8)     rMSGSRC.0<1>:ud  r0.0<8;8,1>:ud  // Initialize message payload header with R0
+#if	defined (INC_BLENDING)
+    mul	(1)	fALPHA_STEP_X:f   fSCALING_STEP_RATIO:f 	fVIDEO_STEP_X:f	//AlphaStepX = StepX_ratio * VideoStepX, where StepX_ratio = AlphaStepX / VideoStepX
+#endif
+
+// End of SetupVPKernel
+
+
+ 
+       
diff --git a/i965_drv_video/shaders/post_processing/Common/common.inc b/i965_drv_video/shaders/post_processing/Common/common.inc
new file mode 100644
index 0000000..a0a66a0
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/common.inc
@@ -0,0 +1,610 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#ifndef COMMON_INC
+#define COMMON_INC
+
+// Module name: common.inc
+//
+// Common header file for all Video-Processing kernels
+//
+
+.default_execution_size (16)
+.default_register_type  :ub
+
+.reg_count_total        80
+.reg_count_payload      4
+
+
+//========== Common constants ==========
+
+// Bit position constants 
+#define BIT0    0x01
+#define BIT1    0x02
+#define BIT2    0x04
+#define BIT3    0x08
+#define BIT4    0x10
+#define BIT5    0x20
+#define BIT6    0x40
+#define BIT7    0x80
+#define BIT8    0x0100
+#define BIT9    0x0200
+#define BIT10   0x0400
+#define BIT11   0x0800
+#define BIT12   0x1000
+#define BIT13   0x2000
+#define BIT14   0x4000
+#define BIT15   0x8000
+#define BIT16   0x00010000
+#define BIT17   0x00020000
+#define BIT18   0x00040000
+#define BIT19   0x00080000
+#define BIT20   0x00100000
+#define BIT21   0x00200000
+#define BIT22   0x00400000
+#define BIT23   0x00800000
+#define BIT24   0x01000000
+#define BIT25   0x02000000
+#define BIT26   0x04000000
+#define BIT27   0x08000000
+#define BIT28   0x10000000
+#define BIT29   0x20000000
+#define BIT30   0x40000000
+#define BIT31   0x80000000
+
+#define nGRFWIB             32      // GRF register width in byte
+#define nGRFWIW             16      // GRF register width in word
+#define nGRFWID             8       // GRF register width in dword
+
+#define nTOP_FIELD          0
+#define nBOTTOM_FIELD       1
+
+#define nPREVIOUS_FRAME     0       // Previous frame
+#define nCURRENT_FRAME      1       // Current frame
+#define nNEXT_FRAME         2       // Next frame
+
+#ifdef GT
+// GT DI Kernel
+#else // ILK
+// ILK DI Kernel
+#endif
+
+//===================================
+
+//========== Macros ==========
+#define REGION(Width,HStride) <Width*HStride;Width,HStride> // Region definition when ExecSize = Width
+
+#define RegFile(a) a
+#define REG(r,n) _REG(RegFile(r),n)
+#define _REG(r,n) __REG(r,n)
+#define __REG(r,n) r##n.0
+#define REG2(r,n,s) _REG2(RegFile(r),n,s)
+#define _REG2(r,n,s) __REG2(r,n,s)
+#define __REG2(r,n,s) r##n.##s
+
+#define dNULLREG     null<1>:d
+#define wNULLREG     null<1>:w
+    
+#define KERNEL_ID(kernel_ID)    mov NULLREG kernel_ID:ud   // NOTE(review): NULLREG is not defined in this file (only dNULLREG/wNULLREG) - confirm it is defined elsewhere
+
+
+#define NODDCLR 			
+#define NODDCLR_NODDCHK 	
+#define NODDCHK			    
+
+//#define NODDCLR 			{ NoDDClr }
+//#define NODDCLR_NODDCHK 	{ NoDDClr, NoDDChk }
+//#define NODDCHK				{ NoDDChk } 
+
+
+//========== Defines ====================
+
+
+//========== GRF partition ==========
+// r0 header            :   r0          (1 GRF)
+// Static parameters    :   r1 - r4     (4 GRFs)
+// Inline parameters    :   r5 - r6     (2 GRFs)
+// MSGSRC               :   r8          (1 GRF)
+// Top IO region        :   r10 - r33   (24 GRFS 8 for each component Y,U,V 16X8:w)
+// Free space           :   r34 - r55   (22 GRFS)
+// Bottom IO region     :   r56 - r79   (24 GRFS 8 for each component Y,U,V 16X8:w) 
+//===================================
+
+
+//========== Static Parameters ==========
+// r1
+#define fPROCAMP_C0             r1.0    // DWORD 0, Procamp constant C0 in :f
+#define wPROCAMP_C0             r1.0    // DWORD 0, Procamp constant C0 in :w
+#define NUMBER_0002							r1.1		// DWORD 0, 0x0002 used in procamp for GT
+#define udCP_MessageFormat      r1.0    // DWORD 0, bits 2:3 of DWORD. (CE)
+#define udCP_StatePointer       r1.0    // DWORD 0, bits 31:5 of DWORD.(CE)
+
+#define ubSRC_CF_OFFSET         r1.4    // DWORD 1, byte 0-2. SRC packed color format YUV offset in :ub
+
+#define ubDEST_RGB_FORMAT        r1.8    // DWORD 2, byte 0. Dest RGB color format (0:ARGB FF:XRGB)
+#define ubDEST_CF_OFFSET        r1.8    // DWORD 2, byte 0-2. Dest packed color format YUV offset in :ub
+
+#define fPROCAMP_C1             r1.3    // DWORD 3, Procamp constant C1 in :f   
+#define wPROCAMP_C1             r1.6    // DWORD 3, Procamp constant C1 in :w   
+#define NUMBER_0100							r1.7		// DWORD 3, 0x0100 used in procamp for GT
+
+#define fPROCAMP_C2             r1.4    // DWORD 4, Procamp constant C2 in :f
+#define wPROCAMP_C2             r1.8    // DWORD 4, Procamp constant C2 in :w
+
+#define uwSPITCH_DIV2           r1.10   // DWORD 5, byte 0-1. statistics surface pitch divided by 2
+
+#define fVIDEO_STEP_Y           r1.6    // DWORD 6, :f, AVS normalized reciprocal of Y Scaling factor
+#define ubSTMM_SHIFT            r1.24   // DWORD 6, byte 0. Amount of right shift for the DI blending equation
+#define ubSTMM_MIN              r1.25   // DWORD 6, byte 1. Min STMM for DI blending equation
+#define ubSTMM_MAX              r1.26   // DWORD 6, byte 2. Max STMM for DI blending equation
+#define ubTFLD_FIRST            r1.27   // DWORD 6, byte 3. Field parity order
+
+#define fPROCAMP_C5             r1.7    // DWORD 7, Procamp constant C5 in :f (NOTE(review): previously labeled C3 - confirm)
+#define wPROCAMP_C5             r1.14   // DWORD 7, Procamp constant C5 in :w (NOTE(review): previously labeled C3 - confirm)
+
+// r2
+#define fPROCAMP_C3             r2.0    // DWORD 0, Procamp constant C3 in :f (NOTE(review): previously labeled C4 - confirm)
+#define wPROCAMP_C3             r2.0    // DWORD 0, Procamp constant C3 in :w (NOTE(review): previously labeled C4 - confirm)
+                    
+#define fCSC_C5					r2.2	// DWORD 2. WG+CSC constant C5
+#define wCSC_C5					r2.4	// DWORD 2. WG+CSC constant C5
+
+#define fPROCAMP_C4             r2.3    // DWORD 3, Procamp constant C4 in :f (NOTE(review): previously labeled C5 - confirm)
+#define wPROCAMP_C4             r2.6    // DWORD 3, Procamp constant C4 in :w (NOTE(review): previously labeled C5 - confirm)
+
+#define fCSC_C8					r2.4	// DWORD 4. WG+CSC constant C8
+#define wCSC_C8					r2.8	// DWORD 4. WG+CSC constant C8
+#define fCSC_C9					r2.7	// DWORD 7. WG+CSC constant C9
+#define wCSC_C9					r2.14	// DWORD 7. WG+CSC constant C9
+
+// r3
+#define fCSC_C0					r3.0	// DWORD 0. WG+CSC constant C0
+#define wCSC_C0					r3.0	// DWORD 0. WG+CSC constant C0
+
+#define fSCALING_STEP_RATIO     r3.1    // DWORD 1, = Alpha_X_Scaling_Step / Video_X_scaling_Step :f (blending)
+#define fALPHA_STEP_X           r3.1    // DWORD 1, = 1/Scale X, 0.5 = 2x, in :f (blending)
+
+#define fALPHA_STEP_Y           r3.2    // DWORD 2, = 1/Scale Y, in :f
+
+#define fCSC_C4					r3.3	// DWORD 3. WG+CSC constant C4
+#define wCSC_C4					r3.6	// DWORD 3. WG+CSC constant C4
+#define fCSC_C1					r3.4	// DWORD 4. WG+CSC constant C1
+#define wCSC_C1					r3.8	// DWORD 4. WG+CSC constant C1
+
+#define wSRC_H_ORI_OFFSET       r3.10   // DWORD 5, bytes 0,1 :w    
+#define wSRC_V_ORI_OFFSET       r3.11   // DWORD 5, bytes 2,3 :w
+
+#define dCOLOR_PIXEL            r3.6    // DWORD 6. Color pixel for Colorfill
+
+#define fCSC_C2					r3.6	// DWORD 6. WG+CSC constant C2
+#define wCSC_C2					r3.12	// DWORD 6. WG+CSC constant C2
+#define fCSC_C3					r3.7	// DWORD 7. WG+CSC constant C3
+#define wCSC_C3					r3.14	// DWORD 7. WG+CSC constant C3
+
+// r4
+#define fCSC_C6					r4.0	// DWORD 0. WG+CSC constant C6
+#define wCSC_C6					r4.0	// DWORD 0. WG+CSC constant C6
+
+#define wFRAME_ENDX             r4.2    // DWORD 1, word 0. Horizontal end = Origin+Width (in pixels)(for multiple blocks)
+#define wNUM_BLKS               r4.3    // DWORD 1, word 1. Number of blocks to process (for multiple blocks)
+
+#define wCOPY_ORIX              r4.5    // DWORD 2, word 1. A copy of X origin (for multiple blocks)
+#define uwNLAS_ENABLE           r4.4    // DWORD 2, bit 15, NLAS enable bit
+
+#define fCSC_C7					r4.3	// DWORD 3. WG+CSC constant C7
+#define wCSC_C7					r4.6	// DWORD 3. WG+CSC constant C7
+#define fCSC_C10				r4.4	// DWORD 4. WG+CSC constant C10
+#define wCSC_C10				r4.8	// DWORD 4. WG+CSC constant C10
+
+#define fFRAME_VID_ORIX         r4.5    // DWORD 5, Frame horizontal origin normalized for scale kernel
+
+#define fFRAME_ALPHA_ORIX       r4.6    // DWORD 6. Normalized alpha horiz origin for the frame
+
+#define fCSC_C11				r4.7	// DWORD 7. WG+CSC constant C11
+#define wCSC_C11				r4.14	// DWORD 7. WG+CSC constant C11
+
+//========================================
+
+//========== Inline parameters ===========
+// r5
+#define wORIX                   r5.0    // DWORD 0, byte 0-1. :w, Destination Block Horizontal Origin in pel
+#define wORIY                   r5.1    // DWORD 0, byte 2-3. :w, Destination Block Vertical   Origin in pel
+
+#define fSRC_VID_H_ORI          r5.1    // DWORD 1, :f, SRC Y horizontal origin normalized for scale kernel
+
+#define fSRC_VID_V_ORI          r5.2    // DWORD 2, :f, SRC Y vertical origin normalized for scale kernel
+
+#define fSRC_ALPHA_H_ORI        r5.3    // DWORD 3, :f, Normalized alpha horizontal origin
+
+#define fSRC_ALPHA_V_ORI        r5.4    // DWORD 4, :f, Normalized alpha vertical origin
+
+#define uwALPHA_MASK_X          r5.10   // DWORD 5, byte 0-1 :w, H. alpha mask
+#define ubALPHA_MASK_Y          r5.22   // DWORD 5, byte 2.  :ub,V. alpha mask
+#define ubBLK_CNT_X             r5.23   // DWORD 5, byte 3,  :ub, Horizontal Block Count per thread
+
+#define udBLOCK_MASK            r5.6    // DWORD 6
+#define uwBLOCK_MASK_H          r5.12   // DWORD 6, byte 0-1 :uw, Block horizontal mask used in non-DWord aligned kernels
+#define ubBLOCK_MASK_V          r5.26   // DWORD 6, byte 2   :ub, Block vertical mask used in non-DWord aligned kernels
+#define ubNUM_BLKS              r5.27   // DWORD 6, byte 3,  :ub, Total Block Count per thread
+
+#define fVIDEO_STEP_X           r5.7    // DWORD 7. :f, AVS normalized reciprocal of X Scaling factor
+
+// r6
+#define fVIDEO_STEP_DELTA       r6.0    // DWORD 0. :f, AVS normalized delta between 2 adjacent scaling steps (used for non-linear scaling)
+
+
+//====================== Binding table =========================================
+
+#if defined(DNDI)
+    // DNDI Surface Binding Table
+    //#define nBI_SRC_CURR        0       // Current input frame surface
+    //#define nBI_SRC_PRIV        1       // Denoised previous input frame surface
+    //#define nBI_SRC_STAT        2       // Statistics input surface (STMM / Noise motion history)
+    //#define nBI_DEST_1ST        3       // 1st deinterlaced output frame surface
+//    #define nBI_DEST_YUV        3       // Dest frame YUV (for DN only)
+    //#define nBI_DEST_Y          3       // Dest frame Y (for DN only)
+    //#define nBI_DEST_2ND        4       // 2nd deinterlaced output frame surface
+    //#define nBI_DEST_DN_CURR    6       // Denoised current output frame surface
+    //#define nBI_DEST_STAT       7       // Statistics output surface (STMM / Noise motion history)
+//    #define nBI_DEST_U          8       // Dest frame U (for DN only)
+//    #define nBI_DEST_V          9       // Dest frame V (for DN only)
+//    #define nBI_SRC_U          10       // Src frame U (for DN only)
+//    #define nBI_SRC_V          11       // Src frame V (for DN only)
+//    #define nBI_SRC_UV         10       // Current src frame for UV
+    
+#endif
+
+#if defined(INPUT_PL3)
+    // PL3 Surface Binding Table
+//    #define nBI_SRC_ALPHA       0       // Alpha
+//    #define nBI_SRC_Y           1       // Current src frame
+//    #define nBI_SRC_U           2       // Current src frame
+//    #define nBI_SRC_V           3       // Current src frame
+//    #define nBI_DEST_Y         10       // Dest frame
+//    #define nBI_DEST_U         11       // Dest frame
+//    #define nBI_DEST_V         12       // Dest frame
+//    #define nBI_DEST_YUV        7       // Dest frame
+//    #define nBI_DEST_RGB        7       // same num as BI_DEST_YUV, never used at the same time
+#endif
+
+#if defined(INPUT_PL2)
+    // PL2 Surface Binding Table
+//    #define nBI_SRC_ALPHA       0       // Alpha
+//    #define nBI_SRC_Y           1       // Current src frame for Y + offseted UV
+//    #define nBI_SRC_YUV         1       // Current src frame for YUV in case of NV12_AVS
+//    #define nBI_SRC_UV          2       // Current src frame for UV
+//    #define nBI_DEST_YUV        7       // Current dest frame for Y + offseted UV
+//    #define nBI_DEST_RGB        7       // same num as BI_DEST_YUV, never used at the same time
+//    #define nBI_DEST_Y         10       // Dest frame
+//    #define nBI_DEST_U         11       // Dest frame
+//    #define nBI_DEST_V         12       // Dest frame
+#endif
+
+#if defined(INPUT_PA) || defined(COLORFILL)
+    // Packed Surface Binding Table 
+//    #define nBI_SRC_ALPHA       0       // Alpha    
+//    #define nBI_SRC_YUV         1       // Current src frame
+//    #define nBI_DEST_YUV        3       // Dest frame
+//    #define nBI_DEST_RGB        3       // same num as BI_DEST_YUV, never used at the same time
+#endif
+
+
+//Super binding table
+#define nBI_ALPHA_SRC                   0
+#define nBI_CURRENT_SRC_YUV             1 
+#define nBI_FIELD_COPY_SRC_1_YUV        1 
+#define nBI_CURRENT_SRC_Y               1 
+#define nBI_FIELD_COPY_SRC_1_Y          1
+#define nBI_CURRENT_SRC_RGB             1  
+#define nBI_CURRENT_SRC_UV              2 
+#define nBI_FIELD_COPY_SRC_1_UV         2 
+#define nBI_CURRENT_SRC_U               2 
+#define nBI_FIELD_COPY_SRC_1_U          2 
+#define nBI_CURRENT_SRC_V               3
+#define nBI_FIELD_COPY_SRC_1_V          3 
+#define nBI_TEMPORAL_REFERENCE_YUV      4 
+#define nBI_FIELD_COPY_SRC_2_YUV        4 
+#define nBI_TEMPORAL_REFERENCE_Y        4 
+#define nBI_FIELD_COPY_SRC_2_Y          4 
+#define nBI_CURRENT_SRC_YUV_HW_DI       4 
+#define nBI_TEMPORAL_REFERENCE_UV       5 
+#define nBI_FIELD_COPY_SRC_2_UV         5 
+#define nBI_TEMPORAL_REFERENCE_U        5 
+#define nBI_FIELD_COPY_SRC_2_U          5 
+#define nBI_DENOISED_PREV_HW_DI         5 
+#define nBI_TEMPORAL_REFERENCE_V        6 
+#define nBI_FIELD_COPY_SRC_2_V          6 
+#define nBI_STMM_HISTORY                6 
+#define nBI_DESTINATION_YUV             7
+#define nBI_DESTINATION_RGB             7
+#define nBI_DESTINATION_Y               7
+#define nBI_DESTINATION_UV              8
+#define nBI_DESTINATION_U               8
+#define nBI_DESTINATION_V               9
+#define nBI_DESTINATION_1_YUV           10
+#define nBI_DESTINATION_1_Y             10
+#define nBI_DESTINATION_1_UV            11
+#define nBI_DESTINATION_1_U             11
+#define nBI_DESTINATION_1_V             12
+#define nBI_DESTINATION_2_YUV           13
+#define nBI_DESTINATION_2_Y             13
+#define nBI_DESTINATION_2_UV            14
+#define nBI_DESTINATION_2_U             14
+#define nBI_DESTINATION_2_V             15
+#define nBI_STMM_HISTORY_OUTPUT         20
+#define nBI_TEMPORAL_REFERENCE_YUV_PDI  21 
+#define nBI_TEMPORAL_REFERENCE_Y_PDI    21 
+#define nBI_TEMPORAL_REFERENCE_UV_PDI   22 
+#define nBI_TEMPORAL_REFERENCE_U_PDI    22 
+#define nBI_TEMPORAL_REFERENCE_V_PDI    23 
+#define nBI_SUBVIDEO_YUV                26
+#define nBI_SUBVIDEO_Y                  26
+#define nBI_SUBVIDEO_UV                 27
+#define nBI_SUBVIDEO_U                  27
+#define nBI_SUBVIDEO_V                  28
+#define nBI_SUBPICTURE_YUV              29
+#define nBI_SUBPICTURE_P8               29
+#define nBI_SUBPICTURE_A8               30
+#define nBI_GRAPHIC_YUV                 31
+#define nBI_GRAPHIC_P8                  31
+#define nBI_GRAPHIC_A8                  32
+
+
+
+//========== Planar Sampler State Table Index ==========
+#define nSI_SRC_ALPHA           0x000   // Sampler State for Alpha
+
+//Sampler Index for AVS/IEF messages
+#define nSI_SRC_Y               0x400   // Sampler State for Y
+#define nSI_SRC_U               0x800   // Sampler State for U
+#define nSI_SRC_V               0xC00   // Sampler State for V
+#define nSI_SRC_UV              0x800   // For NV12 surfaces
+#define nSI_SRC_YUV             0x400   // For Packed surfaces  
+#define nSI_SRC_RGB             0x400   // For ARGB surfaces
+
+//Sampler Index for SIMD16 sampler messages
+#define nSI_SRC_SIMD16_Y        0x100   // Sampler State for Y
+#define nSI_SRC_SIMD16_U        0x200   // Sampler State for U
+#define nSI_SRC_SIMD16_V        0x300   // Sampler State for V
+#define nSI_SRC_SIMD16_UV       0x200   // For NV12 surfaces
+#define nSI_SRC_SIMD16_YUV      0x100   // For Packed surfaces  
+#define nSI_SRC_SIMD16_RGB      0x100   // For ARGB surfaces
+
+
+
+// Common Registers
+#define pCF_Y_OFFSET            a0.4    // Address register holding Y offset
+#define pCF_U_OFFSET            a0.5    // Address register holding U offset
+#define pCF_V_OFFSET            a0.6    // Address register holding V offset
+
+// #define YUV_ORI             ORIX    // Used by writing packed data to dport
+
+
+//================= Message Payload Header fields ==============================
+#define IDP     r0.2:ud     // Interface Descriptor Pointer
+
+//================= Common Message Descriptor  TBD add common load and save =====
+// Message descriptor for dataport media write
+#ifdef GT
+        // Message Descriptors
+                //                = 000 0001 (min message len 1 - add later) 00000 (resp len 0)         
+                //                  1 (header present 1) 0 0 1010 (media block write) 00000
+                //                  00000000 (binding table index - set later)
+                //                = 0x02094000
+        #define nDPMW_MSGDSC      0x02094000
+        #define nDPMR_MSGDSC      0x02098000  // Data Port Media Block Read Message Descriptor
+        // TBD
+#else // ILK
+        // Message Descriptors
+                //                = 000 0001 (min message len 1 - add later) 00000 (resp len 0)         
+                //                  1 (header present 1) 000 0 010 (media block write) 0000
+                //                  00000000 (binding table index - set later)
+                //                = 0x02082000
+        #define nDPMW_MSGDSC      0x02082000  // Data Port Media Block Write Message Descriptor
+        #define nDPMR_MSGDSC      0x0208A000  // Data Port Media Block Read Message Descriptor
+#endif
+
+// Message Length defines
+#define nMSGLEN_1      0x02000000 // Message Length of 1 GRF for Send
+#define nMSGLEN_2      0x04000000 // Message Length of 2 GRF for Send
+#define nMSGLEN_4      0x08000000 // Message Length of 4 GRF for Send
+#define nMSGLEN_8      0x10000000 // Message Length of 8 GRF for Send
+
+// Response Length defines
+#define nRESLEN_1      0x00100000 // Message Response Length of 1  GRF from Send
+#define nRESLEN_2      0x00200000 // Message Response Length of 2  GRF from Send
+#define nRESLEN_3      0x00300000 // Message Response Length of 3  GRF from Send
+#define nRESLEN_4      0x00400000 // Message Response Length of 4  GRF from Send
+#define nRESLEN_5      0x00500000 // Message Response Length of 5  GRF from Send
+#define nRESLEN_8      0x00800000 // Message Response Length of 8  GRF from Send
+#define nRESLEN_9      0x00900000 // Message Response Length of 9  GRF from Send
+#define nRESLEN_11     0x00B00000 // Message Response Length of 11 GRF from Send
+#define nRESLEN_12     0x00C00000 // Message Response Length of 12 GRF from Send
+#define nRESLEN_16     0x01000000 // Message Response Length of 16 GRF from Send
+
+// Block Width and Height Size defines
+#define nBLOCK_WIDTH_4   0x00000003      // Block Width  4
+#define nBLOCK_WIDTH_5   0x00000004      // Block Width  5
+#define nBLOCK_WIDTH_8   0x00000007      // Block Width  8
+#define nBLOCK_WIDTH_9   0x00000008      // Block Width  9
+#define nBLOCK_WIDTH_12  0x0000000B      // Block Width  12
+#define nBLOCK_WIDTH_16  0x0000000F      // Block Width  16
+#define nBLOCK_WIDTH_20  0x00000013      // Block Width  20
+#define nBLOCK_WIDTH_32  0x0000001F      // Block Width  32
+#define nBLOCK_HEIGHT_1  0x00000000      // Block Height 1
+#define nBLOCK_HEIGHT_2  0x00010000      // Block Height 2
+#define nBLOCK_HEIGHT_4  0x00030000      // Block Height 4
+#define nBLOCK_HEIGHT_5  0x00040000      // Block Height 5
+#define nBLOCK_HEIGHT_8  0x00070000      // Block Height 8
+
+// Extended Message Descriptors
+#define nEXTENDED_MATH      0x1
+#define nSMPL_ENGINE        0x2
+#define nMESSAGE_GATEWAY    0x3
+#define nDATAPORT_READ      0x4
+#define nDATAPORT_WRITE     0x5
+#define nURB                0x6
+#define nTS_EOT             0x27    // with End-Of-Thread bit ON
+
+// Common message descriptors:
+#ifdef GT
+	#define nEOT_MSGDSC       0x02000010  // End of Thread Message Descriptor    
+	#define IF_NULL						null:uw null:uw null:uw 	//for different if instructions on ILK and Gen6
+#else //ILK
+	#define nEOT_MSGDSC       0x02000000  // End of Thread Message Descriptor        
+	#define IF_NULL
+#endif    
+
+
+//===================== Math Function Control ===================================
+#define mfcINV                  0x1     // reciprocal
+#define mfcLOG                  0x2     // log
+#define mfcEXP                  0x3     // exponent
+#define mfcSQRT                 0x4     // square root
+#define mfcRSQ                  0x5     // reciprocal square root
+#define mfcSIN                  0x6     // sine (in radians)
+#define mfcCOS                  0x7     // cosine (in radians)
+#define mfcSINCOS               0x8     // dst0 = sin of src0, dst1 = cosine of src0 (in radians) - GT+ ONLY
+#define mfcPOW                  0xA     // abs(src0) raised to the src1 power    
+#define mfcINT_DIV_QR           0xB     // return quotient and remainder
+#define mfcINT_DIV_Q            0xC     // return quotient
+#define mfcINT_DIV_R            0xD     // return remainder
+
+
+//=================== Message related registers =================================
+
+#ifdef GT
+        #define udDUMMY_NULL
+#else   // _ILK
+        #define udDUMMY_NULL    null:ud         // Used in send inst as src0
+#endif
+
+ 
+//----------- Message Registers ------------
+#define mMSGHDR      m1     // Message Payload Header
+#define mMSGHDRY     m1     // Message Payload Header register for Y data
+#define mMSGHDRU     m2     // Message Payload Header register for U data
+#define mMSGHDRV     m3     // Message Payload Header register for V data
+#define mMSGHDRYA    m4     // Second Message Payload Header register for Y data
+#define mMSGHDRH     m5     // Message Payload Header register for motion history
+#define mMSGHDRY1    m1     // Message Payload Header register for first  Y data
+#define mMSGHDRY2    m2     // Message Payload Header register for second Y data
+#define mMSGHDRY3    m3     // Message Payload Header register for third  Y data
+#define mMSGHDRY4    m4     // Message Payload Header register for fourth Y data
+#define mMSGHDRY5    m5     // Message Payload Header register for fifth Y data
+#define mMSGHDRY6    m6     // Message Payload Header register for sixth Y data
+#define mMSGHDR_EOT  m15    // Dummy Message Register for EOT
+
+#define rMSGSRC     r8      // Message source register
+#define pMSGDSC     a0.0:ud // Message Descriptor register (type DWORD)
+
+#define udMH_ORI    rMSGSRC.0   // Data Port Media Block R/W message header block offset
+#define udMH_ORIX   rMSGSRC.0   // Data Port Media Block R/W message header X offset
+#define udMH_ORIY   rMSGSRC.1   // Data Port Media Block R/W message header Y offset
+#define udMH_SIZE   rMSGSRC.2   // Data Port Media Block R/W message header block width & height
+
+//  M2 - M9 for message data payload
+.declare    mubMSGPAYLOAD  Base=m2 ElementSize=1 SrcRegion=REGION(16,1) Type=ub
+.declare    muwMSGPAYLOAD  Base=m2 ElementSize=2 SrcRegion=REGION(16,1) Type=uw
+.declare    mudMSGPAYLOAD  Base=m2 ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+.declare    mfMSGPAYLOAD   Base=m2 ElementSize=4 SrcRegion=REGION(8,1) Type=f
+
+//=================== End of thread instruction ===========================
+#ifdef GT
+	#define END_THREAD          mov  (8) mMSGHDR_EOT<1>:ud    r0.0<8;8,1>:ud \n\
+								send (1) null<1>:d mMSGHDR_EOT nTS_EOT nEOT_MSGDSC 
+#else   // ILK				This should be changed to 1 instruction; I have tested it and it works - vK
+	#define END_THREAD          mov  (8) mMSGHDR_EOT<1>:ud    r0.0<8;8,1>:ud \n\
+                            	send (1) dNULLREG mMSGHDR_EOT udDUMMY_NULL  nTS_EOT nEOT_MSGDSC:ud
+#endif
+
+
+//=======================================================================
+// Region declarations for SRC and DEST as TOP and BOT
+
+// Common I/O regions
+#define nREGION_1       1
+#define nREGION_2       2
+
+//*** These region base GRFs are fixed regardless planar/packed, and data alignment.
+//*** Each kernel is responsible to select the correct region declaration below.
+//*** YUV regions are not necessarily next to each other.
+#define nTOP_Y          10      // r10 - r17  (8 GRFs)
+#define nTOP_U          18      // r18 - r25 (8 GRFs)
+#define nTOP_V          26      // r26 - r33 (8 GRFs)
+
+#define nBOT_Y          56      // r56 - r63 (8 GRFs)
+#define nBOT_U          64      // r64 - r71 (8 GRFs)
+#define nBOT_V          72      // r72 - r79 (8 GRFs)
+
+// Define temp space for any usages
+#define nTEMP0          34
+#define nTEMP1          35
+#define nTEMP2          36
+#define nTEMP3          37
+#define nTEMP4          38
+#define nTEMP5          39
+#define nTEMP6          40
+#define nTEMP7          41
+#define nTEMP8          42
+#define nTEMP10         44
+#define nTEMP12         46
+#define nTEMP14         48
+#define nTEMP16         50
+#define nTEMP17         51
+#define nTEMP18         52
+
+#define nTEMP24			58
+
+// Common region 1
+.declare ubTOP_Y        Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub
+.declare ubTOP_U        Base=REG(r,nTOP_U) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub
+.declare ubTOP_V        Base=REG(r,nTOP_V) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub
+                        
+.declare uwTOP_Y        Base=REG(r,nTOP_Y) ElementSize=2 SrcRegion=REGION(16,1) DstRegion=<1> Type=uw
+.declare uwTOP_U        Base=REG(r,nTOP_U) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare uwTOP_V        Base=REG(r,nTOP_V) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare ub2TOP_Y       Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(16,2) DstRegion=<1> Type=ub
+.declare ub2TOP_U       Base=REG(r,nTOP_U) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub
+.declare ub2TOP_V       Base=REG(r,nTOP_V) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub
+
+.declare ub4TOP_Y       Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+.declare ub4TOP_U       Base=REG(r,nTOP_U) ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+.declare ub4TOP_V       Base=REG(r,nTOP_V) ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+
+.declare ubTOP_ARGB     Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+
+// Used by "send" instruction
+.declare udTOP_Y_IO     Base=REG(r,nTOP_Y) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+.declare udTOP_U_IO     Base=REG(r,nTOP_U) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+.declare udTOP_V_IO     Base=REG(r,nTOP_V) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+
+// Common region 2
+.declare ubBOT_Y        Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub
+.declare ubBOT_U        Base=REG(r,nBOT_U) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub
+.declare ubBOT_V        Base=REG(r,nBOT_V) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub
+                        
+.declare uwBOT_Y        Base=REG(r,nBOT_Y) ElementSize=2 SrcRegion=REGION(16,1) DstRegion=<1> Type=uw
+.declare uwBOT_U        Base=REG(r,nBOT_U) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare uwBOT_V        Base=REG(r,nBOT_V) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare ub2BOT_Y       Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(16,2) DstRegion=<1> Type=ub
+.declare ub2BOT_U       Base=REG(r,nBOT_U) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub
+.declare ub2BOT_V       Base=REG(r,nBOT_V) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub
+
+.declare ubBOT_ARGB     Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+
+// Used by "send" instruction
+.declare udBOT_Y_IO     Base=REG(r,nBOT_Y) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+.declare udBOT_U_IO     Base=REG(r,nBOT_U) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+.declare udBOT_V_IO     Base=REG(r,nBOT_V) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+
+// End of common.inc
+
+#endif    // COMMON_INC
diff --git a/i965_drv_video/shaders/post_processing/Common/readSampler16x1.asm b/i965_drv_video/shaders/post_processing/Common/readSampler16x1.asm
new file mode 100644
index 0000000..36c4be6
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/readSampler16x1.asm
@@ -0,0 +1,55 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: readSampler16x1.asm
+//
+// Read one row of pix through sampler
+//
+
+
+
+//#define SAMPLER_MSG_DSC		0x166A0000	// ILK Sampler Message Descriptor
+
+
+
+// Send Message [DevILK]                                Message Descriptor
+//  MBZ MsgL=5 MsgR=8                            H MBZ   SIMD     MsgType   SmplrIndx BindTab
+//  000 0 101 0 1000                             1  0     10     0000         0000    00000000
+//    0     A    8                                     A             0             0     0     0
+
+//     MsgL=1+2*2(u,v)=5 MsgR=8
+ 
+#define SAMPLER_MSG_DSC		0x0A8A0000	// ILK Sampler Message Descriptor
+
+
+
+
+
+                                                                                
+
+	// Assume MSGSRC is set already in the caller
+        //mov (8)		rMSGSRC.0<1>:ud			0:ud	// Unused fields
+
+
+
+	// Read 16 sampled pixels and store them as float32 in 8 GRFs.
+	// 422 data is expanded to 444; returns 8 GRFs in the order of RGB- (UYV-).
+	// 420 data has three surfaces; returns 8 GRFs. Valid data is always in the 1st GRF when in R8.  Make sure not to overwrite the following 3 GRFs.
+	// Alpha data is expanded to 4444; returns 8 GRFs in the order of RGBA (UYVA).
+
+    mov(16)     mMSGHDR<1>:uw   rMSGSRC<16;16,1>:uw
+    send (16)	DATABUF(0)<1>	mMSGHDR		udDUMMY_NULL	0x2 SAMPLER_MSG_DSC+SAMPLER_IDX+BINDING_IDX:ud
+
+
+
+
+    
+
+
diff --git a/i965_drv_video/shaders/post_processing/Common/undefall.inc b/i965_drv_video/shaders/post_processing/Common/undefall.inc
new file mode 100644
index 0000000..241bd70
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Common/undefall.inc
@@ -0,0 +1,65 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: undefall.inc
+//
+// Undefine all global symbols for a new process
+//
+
+//Source definitions
+#undef  ubSRC_Y   
+#undef  ubSRC_U   
+#undef  ubSRC_V 
+
+#undef  ub2SRC_Y   
+#undef  ub2SRC_U   
+#undef  ub2SRC_V
+
+#undef  ub4SRC_Y   
+#undef  ub4SRC_U   
+#undef  ub4SRC_V
+
+#undef  uwSRC_Y   
+#undef  uwSRC_U   
+#undef  uwSRC_V
+
+#undef  udSRC_Y   
+#undef  udSRC_U   
+#undef  udSRC_V
+
+#undef  udSRC_YUV
+#undef  nSRC_YUV_REG
+
+//Destination definitions
+#undef  ubDEST_Y   
+#undef  ubDEST_U   
+#undef  ubDEST_V 
+
+#undef  ub2DEST_Y   
+#undef  ub2DEST_U   
+#undef  ub2DEST_V
+
+#undef  ub4DEST_Y   
+#undef  ub4DEST_U   
+#undef  ub4DEST_V
+
+#undef  uwDEST_Y   
+#undef  uwDEST_U   
+#undef  uwDEST_V
+
+#undef  udDEST_Y   
+#undef  udDEST_U   
+#undef  udDEST_V
+
+#undef  udDEST_YUV
+#undef  nDEST_YUV_REG
+#undef  ubDEST_ARGB
+
+// End of undefall.inc
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_IEF.inc b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_IEF.inc
new file mode 100644
index 0000000..cbed61a
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_IEF.inc
@@ -0,0 +1,108 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: AVS_IEF.inc
+// NOTE(review): the include guard below is spelled _AVS_INF_INC_ (INF, not IEF); left unchanged.
+#ifndef _AVS_INF_INC_
+#define _AVS_INF_INC_
+
+#include "undefall.inc"             // Undefine the SRC and DEST symbols
+
+        // Message Header
+        // m0.7         31:0    Debug
+        // m0.6         31:0    Debug
+        // m0.5         31:0    Ignored
+        // m0.4         31:0    Ignored
+        // m0.3         31:0    Ignored
+        // m0.2         31:16   Ignored
+        //              15      Alpha Write Channel Mask        enable=0, disable=1
+        //              14      Blue Write Channel Mask  (V)    
+        //              13      Green Write Channel Mask (Y)
+        //              12      Red Write Channel Mask   (U)
+        //              11:0    Ignored
+        // m0.1                 Ignored
+        // m0.0                 Ignored
+
+#define mAVS_8x8_HDR   m0               // Message Header
+#define mAVS_PAYLOAD   m1               // Message Payload Header
+
+#define mAVS_8x8_HDR_2   m2               // Message Header (second message)
+#define mAVS_PAYLOAD_2   m3               // Message Payload Header (second message)
+
+#define mAVS_8x8_HDR_UV   m2               // Message Header (UV message; same register as mAVS_8x8_HDR_2)
+#define mAVS_PAYLOAD_UV   m3               // Message Payload Header (UV message; same register as mAVS_PAYLOAD_2)
+
+#define rAVS_8x8_HDR   rMSGSRC          // Mirror of Message Header 
+#define rAVS_PAYLOAD   r9               // Mirror of Message Payload Header
+        
+        // AVS payload
+        // m1.7                 Ignored
+        // m1.6                 Pixel 0 V Address       ---> ORIY (Y0)
+        // m1.5                 Delta V                 ---> Step Y
+        // m1.4                 Ignored
+        // m1.3                 Ignored
+        // m1.2                 Pixel 0 U Address       ---> ORIX (X0)
+        // m1.1                 U 2nd Derivative        ---> NLAS dx   (NOTE(review): AVS_SetupFirstBlock.asm writes Step X to .1 - confirm)
+        // m1.0                 Delta U                 ---> Step X    (NOTE(review): AVS_SetupFirstBlock.asm writes the step delta to .0 - confirm)
+
+        // Sampler Message Descriptor
+        // 31:29        Reserved                        000
+        // 28:25        Message length                  0010
+        // 24:20        Response length                 xxxxx   ---> 4GRFs for each enabled channel
+        // 19           Header Present                  1
+        // 18           MBZ                             0
+        // 17:16        SIMD Mode                       11      ---> SIMD64
+        // 15:12        Message Type                    1011    ---> sample_8x8 (0xB in the constants below)
+        // 11:8         Sampler Index                   xxxx
+        // 7:0          Binding Table Index             xxxxxxxx
+#define nAVS_MSG_DSC_1CH        0x044BB000      // Response length 4
+#define nAVS_MSG_DSC_2CH        0x048BB000      // Response length 8
+#define nAVS_MSG_DSC_3CH        0x04CBB000      // Response length 12
+#define nAVS_MSG_DSC_4CH        0x050BB000      // Response length 16
+
+#define nAVS_RED_CHANNEL_ONLY   0x0000E000      // Enable Red channel only
+#define nAVS_GREEN_CHANNEL_ONLY 0x0000D000      // Enable Green channel only
+#define nAVS_RED_BLUE_CHANNELS  0x0000A000      // Enable Red and Blue channels
+#define nAVS_RGB_CHANNELS       0x00008000      // Enable RGB(YUV) channels
+#define nAVS_ALL_CHANNELS       0x00000000      // Enable all channels (ARGB\AYUV)
+
+        // Byte and word views of the two sampler response areas (based at nTEMP8 and nTEMP24)
+
+.declare     ubAVS_RESPONSE  Base=REG(r,nTEMP8) ElementSize=1  SrcRegion=REGION(16,1) Type=ub
+.declare     uwAVS_RESPONSE  Base=REG(r,nTEMP8) ElementSize=2  SrcRegion=REGION(16,1) Type=uw
+
+.declare     ubAVS_RESPONSE_2  Base=REG(r,nTEMP24) ElementSize=1  SrcRegion=REGION(16,1) Type=ub
+.declare     uwAVS_RESPONSE_2  Base=REG(r,nTEMP24) ElementSize=2  SrcRegion=REGION(16,1) Type=uw
+
+// Select the destination symbols from the current source region, then re-define nSRC_REGION explicitly
+#if (nSRC_REGION==nREGION_2)
+    #define uwDEST_Y        uwBOT_Y
+    #define uwDEST_U        uwBOT_U
+    #define uwDEST_V        uwBOT_V
+
+    #define ubDEST_Y        ubBOT_Y
+    
+    #undef  nSRC_REGION
+    #define nSRC_REGION nREGION_2
+
+#else // any other value is treated as nREGION_1
+    #define uwDEST_Y        uwTOP_Y
+    #define uwDEST_U        uwTOP_U
+    #define uwDEST_V        uwTOP_V
+
+    #define ubDEST_Y        ubTOP_Y
+    
+    #undef  nSRC_REGION
+    #define nSRC_REGION     nREGION_1
+
+#endif
+
+
+#endif //_AVS_INF_INC_
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupFirstBlock.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupFirstBlock.asm
new file mode 100644
index 0000000..d45ce44
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupFirstBlock.asm
@@ -0,0 +1,35 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//------------------------------------------------------------------------------
+// AVS_SetupFirstBlock.asm
+//------------------------------------------------------------------------------
+        
+    // Setup Message Header (disabled: the header copy below is commented out)
+//    mov (8) mAVS_8x8_HDR<1>:ud      rMSGSRC<8;8,1>:ud
+
+    // Check NLAS enable bit (BIT15 of uwNLAS_ENABLE); when it is clear, force the step delta to 0
+    and.z.f0.0	(1)	wNULLREG                uwNLAS_ENABLE:uw	BIT15:uw	
+    (f0.0)mov   (1) fVIDEO_STEP_DELTA:f     0.0:f   
+    // NOTE(review): AVS_IEF.inc's payload table lists m1.0 as Delta U and m1.1 as U 2nd Derivative; confirm the .0/.1 mapping below.
+    // Setup Message Payload Header for 1st block of Media Sampler 8x8
+    mov (1) rAVS_PAYLOAD.0:f        fVIDEO_STEP_DELTA:f     // NLAS dx (step delta)
+    mov (1) rAVS_PAYLOAD.1:f        fVIDEO_STEP_X:f         // Step X
+    mov (1) rAVS_PAYLOAD.5:f        fVIDEO_STEP_Y:f         // Step Y
+    mov (2) rAVS_PAYLOAD.2<4>:f     fSRC_VID_H_ORI<2;2,1>:f // Orig X -> .2 and Orig Y -> .6 (destination stride 4)
+
+
+    
+
+
+
+
+
+		
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupSecondBlock.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupSecondBlock.asm
new file mode 100644
index 0000000..8f125dc
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupSecondBlock.asm
@@ -0,0 +1,27 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//------------------------------------------------------------------------------
+// AVS_SetupSecondBlock.asm
+//------------------------------------------------------------------------------
+        
+    // NLAS calculations for the 2nd block of Media Sampler 8x8 (8 pixels after the 1st block):
+    // X(i) = X0 + dx*i + ddx*i*(i-1)/2   ==>  X(8) = X0 + dx*8 + ddx*28
+    // dx(i)= dx(0) + ddx*i               ==>  dx(8)= dx + ddx*8
+
+    // Calculating X(8); here X0 = fSRC_VID_H_ORI, dx = fVIDEO_STEP_X, ddx = fVIDEO_STEP_DELTA
+    mov (1)   acc0.2<1>:f           fSRC_VID_H_ORI:f                         // acc  = X0
+    mac (1)   acc0.2<1>:f           fVIDEO_STEP_X:f          8.0:f           // acc += dx*8
+    mac (1)   rAVS_PAYLOAD.2:f      fVIDEO_STEP_DELTA:f      28.0:f          // payload.2 = acc + ddx*28
+    
+    // Calculating dx(8)
+    mov (1)   acc0.1<1>:f           fVIDEO_STEP_X:f                          // acc  = dx
+    mac (1)   rAVS_PAYLOAD.1:f      fVIDEO_STEP_DELTA:f      8.0:f           // payload.1 = acc + ddx*8
+		
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DI.inc b/i965_drv_video/shaders/post_processing/Core_Kernels/DI.inc
new file mode 100644
index 0000000..62f84c0
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DI.inc
@@ -0,0 +1,194 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: DI.inc
+
+#ifdef GT
+// GT DI Kernel
+#else // ILK
+// ILK DI Kernel
+#endif
+ 
+
+//---------------------------------------------------------------------------
+// Binding table indices
+//---------------------------------------------------------------------------
+#define nBIDX_DI_PRV		10		// Previous DI-ed frame
+#define nBIDX_DI_CUR		13		// Current DI-ed frame
+#define	nBIDX_DN			7		// Denoised frame
+#define	nBIDX_STAT			20		// Statistics surface
+#define nBIDX_DI_Source  4  // Source surface
+
+
+//---------------------------------------------------------------------------
+// Message descriptors
+//---------------------------------------------------------------------------
+// Extended message descriptor
+#define nSMPL_ENGINE		0x2
+#define nDATAPORT_WRITE		0x5
+#define nTS_EOT				0x27	// with End-Of-Thread bit ON
+
+		// Message descriptor for end-of-thread
+		//						= 000 0001 (message len) 00000 (resp len)
+		//						  0 (header present 0) 00000000000000 0 (URB dereferenced) 0000
+#define nEOT_MSGDSC			0x02000000
+
+		// Message descriptor for sampler read
+		//						= 000 0010 (message len 2) 00000 (resp len - set later from nSMPL_RESP_LEN_*)
+		//						  1 (header present 1) 0 11 (SIMD32/64 mode) 
+		//						  1000 (message type) 0000 (DI state index) 
+		//						  00000000 (binding table index - set later)
+		//						= 0x040b8000
+
+// comment begin
+// The following is commented out because of walker feature
+// It corresponds to the #ifdef GT #else and #endif
+//#define nSMPL_MSGDSC		    0x040b8000
+//#define nSMPL_RESP_LEN_DI	    0x00c00000		// 12
+//#define nSMPL_RESP_LEN_NODI_PL  0x00500000		// 5
+//#define nSMPL_RESP_LEN_NODI_PA  0x00900000		// 9
+//#define nSMPL_RESP_LEN_NODN	    0x00900000		// 9
+//#define nSMPL_RESP_LEN_PDI	    0x00b00000		// 11
+// comment end
+
+#ifdef GT
+// GT values: identical to the ILK values below except nSMPL_RESP_LEN_NODN (10 instead of 9)
+#define nSMPL_MSGDSC		    0x040b8000
+#define nSMPL_RESP_LEN_DI	    0x00c00000		// 12
+#define nSMPL_RESP_LEN_NODI_PL  0x00500000		// 5  // DI disabled: the XY is stored in the 5th GRF, no impact on return length
+#define nSMPL_RESP_LEN_NODI_PA  0x00900000		// 9  // DI disabled: the XY is stored in the 5th GRF, no impact on return length
+#define nSMPL_RESP_LEN_NODN	    0x00a00000		// 10 // No DN: originally 9, now 10 is needed to store the XY with walker
+#define nSMPL_RESP_LEN_PDI	    0x00b00000		// 11
+
+#else
+// ILK values
+#define nSMPL_MSGDSC		    0x040b8000
+#define nSMPL_RESP_LEN_DI	    0x00c00000		// 12
+#define nSMPL_RESP_LEN_NODI_PL  0x00500000		// 5
+#define nSMPL_RESP_LEN_NODI_PA  0x00900000		// 9
+#define nSMPL_RESP_LEN_NODN	    0x00900000		// 9
+#define nSMPL_RESP_LEN_PDI	    0x00b00000		// 11
+
+#endif
+
+		// Message descriptor for dataport media write
+#ifdef GT
+		//						= 000 0000 (message len - set later) 00000 (resp len 0) 		
+		//						  1 (header present 1) 0 0 1010 (media block write) 00000
+		//						  00000000 (binding table index - set later)
+		//						= 0x00094000
+#define nDPMW_MSGDSC		    0x00094000
+#else // ILK
+		//						= 000 0000 (message len - set later) 00000 (resp len 0) 		
+		//						  1 (header present 1) 000 0 010 (media block write) 0000
+		//						  00000000 (binding table index - set later)
+		//						= 0x00082000
+#define nDPMW_MSGDSC		    0x00082000
+#endif
+#define nDPMW_MSG_LEN_STMM	    0x04000000		// 2 - STMM
+#define nDPMW_MSG_LEN_DH	    0x04000000		// 2 - Denoise history
+#define nDPMW_MSG_LEN_PA_DN	    0x0a000000		// 5 - Denoised output
+#define nDPMW_MSG_LEN_PA_NODI	0x12000000		// 9 - Denoised output - denoise only - DI disabled
+#define nDPMW_MSG_LEN_PL_DN	    0x06000000		// 3 - Denoised output
+#define nDPMW_MSG_LEN_PL_NODI	0x0a000000		// 5 - Denoised output - denoise only - DI disabled
+#define nDPMW_MSG_LEN_DI	    0x0a000000		// 5 - DI output
+
+
+//---------------------------------------------------------------------------
+// Static and inline parameters
+//---------------------------------------------------------------------------
+// Static parameters (all based in r1)
+.declare ubTFLD_FIRST		Base=r1.27	ElementSize=1 Type=ub	// top field first
+.declare ubSRCYUVOFFSET		Base=r1.4	ElementSize=1 Type=ub	// source packed format
+.declare ubDSTYUVOFFSET		Base=r1.8	ElementSize=1 Type=ub	// destination packed format
+.declare uwSPITCH_DIV2		Base=r1.10	ElementSize=2 Type=uw	// statistics surface pitch divided by 2
+
+// Inline parameters (based in r5)
+.declare uwXORIGIN			Base=r5.0	ElementSize=2 Type=uw	// X origin
+.declare uwYORIGIN			Base=r5.1	ElementSize=2 Type=uw
+
+
+//---------------------------------------------------------------------------
+// Kernel GRF variables 
+//---------------------------------------------------------------------------
+// Message response (Denoised & DI-ed pixels & statistics)
+.declare dRESP						Base=r8		ElementSize=4 Type=d	// Response message (length set by nSMPL_RESP_LEN_*)
+.declare ubRESP						Base=r8		ElementSize=1 Type=ub	
+
+.declare dSTMM						Base=r16	ElementSize=4 Type=d	// STMM
+.declare ubDN_HIST_NODI		Base=r12	ElementSize=1 Type=ub	// Denoise history data (DI disabled)
+.declare ubDN_HIST_DI			Base=r17	ElementSize=1 Type=ub	// Denoise history data (DI enabled)
+.declare uwRETURNED_POSITION_DI	Base=r17	ElementSize=2 Type=uw	// XY_Return_Data (DI enabled)
+.declare uwRETURNED_POSITION_DN	Base=r12	ElementSize=2 Type=uw // XY_Return_Data (DI disabled)
+
+.declare ub1ST_FLD_DN			Base=r12	ElementSize=1 Type=ub	// 1st field Denoised data (DI enabled)
+.declare d1ST_FLD_DN			Base=r12	ElementSize=4 Type=d
+.declare ub2ND_FLD_DN			Base=r18	ElementSize=1 Type=ub	// 2nd field Denoised data (DI enabled)	
+.declare d2ND_FLD_DN			Base=r18	ElementSize=4 Type=d
+.declare ubPRV_DI					Base=r8		ElementSize=1 Type=ub	// Previous frame DI (DI enabled)
+.declare ubCUR_DI					Base=r12	ElementSize=1 Type=ub	// Current frame DI (DI enabled)
+
+// Packed denoised output
+.declare ubDN_YUV					Base=r22	ElementSize=1 Type=ub	// Denoised YUV422
+.declare dDN_YUV					Base=r22	ElementSize=4 Type=d
+#define	 npDN_YUV			704									// = 22*32 = 0x2C0
+
+// Packed DI output
+.declare dDI_YUV_PRV			Base=r32	ElementSize=4 Type=d	// Previous frame DI output
+.declare dDI_YUV_CUR			Base=r36	ElementSize=4 Type=d	// Current frame DI output
+#define	 npDI_YUV			1024									// = 32*32 = 0x400
+
+// For packed output
+#define	 p422_YOFFSET		a0.2	
+#define	 p422_UOFFSET		a0.3	
+#define	 p422_VOFFSET		a0.4
+#define	 pDN_TFLDSRC		a0.6	
+#define	 pDN_BFLDSRC		a0.7	
+#define	 npRESP				192									// = 6*32
+
+// Message source (ud, uw and d views of r70)
+.declare udMSGSRC					Base=r70	  ElementSize=4 Type=ud
+.declare uwMSGSRC					Base=r70	  ElementSize=2 Type=uw
+.declare dMSGSRC          Base=r70    ElementSize=4 Type=d
+
+
+//---------------------------------------------------------------------------
+// Kernel MRF variables 
+//---------------------------------------------------------------------------
+#define	mMSGHDR_SMPL		m1									// Sampler message: m1~m2
+.declare mudMSGHDR_SMPL		Base=m1		ElementSize=4 Type=ud
+.declare muwMSGHDR_SMPL		Base=m1		ElementSize=2 Type=uw
+#define	mMSGHDR_DN			m3									// Denoise output: m3~m7 for PA, m3~m5 for PL
+.declare mdMSGHDR_DN		Base=m3		ElementSize=4 Type=d
+#define	mMSGHDR_STAT		m8									// Statistics output: m8~m9
+.declare mdMSGHDR_STAT		Base=m8		ElementSize=4 Type=d
+.declare mubMSGHDR_STAT		Base=m8		ElementSize=1 Type=ub
+#define	mMSGHDR_DI			m10									// DI output: m10~m14
+.declare mdMSGHDR_DI		Base=m10	ElementSize=4 Type=d
+#define	mMSGHDR_EOT			m15									// EOT
+
+#ifdef GT
+#define	MSGSRC
+#else
+#define MSGSRC				null:ud
+#endif
+
+        
+//---------------------------------------------------------------------------
+// End of thread instruction (the ILK form carries an extra null:ud operand)
+//---------------------------------------------------------------------------
+#ifdef GT
+#define END_THREAD			send (8) null<1>:d mMSGHDR_EOT nTS_EOT nEOT_MSGDSC 
+#else	// ILK
+#define END_THREAD			send (8) null<1>:d mMSGHDR_EOT null:ud	nTS_EOT nEOT_MSGDSC
+#endif
+
+
+// end of DI.inc
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DI_Hist_Save.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_Hist_Save.asm
new file mode 100644
index 0000000..ae8ff85
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_Hist_Save.asm
@@ -0,0 +1,24 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+
+// Write denoise history to memory (DI-enabled layout: history is read at nDI_HIST_OFFSET)
+shr (2)    rMSGSRC.0<1>:ud    wORIX<2;2,1>:w            2:w                      NODDCLR           // X,Y origin / 4
+add (1)    rMSGSRC.0<1>:ud    rMSGSRC.0<0;1,0>:ud       uwSPITCH_DIV2<0;1,0>:uw  NODDCLR_NODDCHK  // X origin += uwSPITCH_DIV2
+mov (1)    rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_HIST:ud                            NODDCHK           // block width and height (4x2)
+
+mov (8)    mMSGHDR_HIST<1>:ud      rMSGSRC.0<8;8,1>:ud                   // message header
+mov (1)    mudMSGHDR_HIST(1)<1>    udRESP(nDI_HIST_OFFSET,0)<0;1,0>    // Move one dword of denoise history to MRF
+
+send (8)   dNULLREG    mMSGHDR_HIST    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud
+
+
+
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DI_SAVE_PA.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_SAVE_PA.asm
new file mode 100644
index 0000000..f4e2fe7
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_SAVE_PA.asm
@@ -0,0 +1,56 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+    shl (1) rMSGSRC.0<1>:ud     wORIX<0;1,0>:w            1:w  NODDCLR             // Horizontal block origin is doubled
+    mov (1) rMSGSRC.1<1>:ud     wORIY<0;1,0>:w                 NODDCLR_NODDCHK    // Vertical block origin
+    mov (1) rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_DI:ud          NODDCHK             // Block width and height (32x8) - NOTE(review): DNDI.inc defines nDPW_BLOCK_SIZE_DI as 32x4; confirm
+    
+	
+	add (4) pCF_Y_OFFSET<1>:uw   ubDEST_CF_OFFSET<4;4,1>:ub   nDEST_YUV_REG*nGRFWIB:w    // Initial Y,U,V offset in YUV422 block
+
+	// Pack 2nd field Y
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+		mov     (16) r[pCF_Y_OFFSET, %1*nGRFWIB]<2>       ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+	// Pack 1st field Y - NOTE(review): offset is written %1+4*nGRFWIB, not (%1+4)*nGRFWIB; confirm how the $for preprocessor evaluates it (same for U and V below)
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+		mov     (16) r[pCF_Y_OFFSET, %1+4*nGRFWIB]<2>       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+	// Pack 2nd field U
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_U_OFFSET,   %1*nGRFWIB]<4>  ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  // U pixels
+    }
+	// Pack 1st field U
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_U_OFFSET,   %1+4*nGRFWIB]<4>  ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  // U pixels
+    }
+	// Pack 2nd field V
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_V_OFFSET,   %1*nGRFWIB]<4>  ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>  // V pixels
+    }
+	// Pack 1st field V
+    $for(0; <nUV_NUM_OF_ROWS; 1) {
+        mov (8) r[pCF_V_OFFSET,   %1+4*nGRFWIB]<4>  ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>  // V pixels
+    }
+
+    // Save the previous frame: header + udDEST_YUV(0..3) written to nBI_DESTINATION_1_YUV
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud
+    $for(0; <4; 1) {
+            mov (8) mudMSGPAYLOAD(%1)<1>  udDEST_YUV(%1)REGION(8,1)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_1_YUV:ud
+
+    // Save the current frame: header + udDEST_YUV(4..7) written to nBI_DESTINATION_2_YUV
+    mov (8) mMSGHDR<1>:ud       rMSGSRC<8;8,1>:ud
+    $for(0; <4; 1) {
+            mov (8) mudMSGPAYLOAD(%1)<1>  udDEST_YUV(%1+4)REGION(8,1)
+    }
+    send (8)    dNULLREG    mMSGHDR   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_2_YUV:ud
+	
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI.inc b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI.inc
new file mode 100644
index 0000000..3258756
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI.inc
@@ -0,0 +1,162 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: DNDI.inc
+
+#ifdef GT
+// GT DI Kernel
+#else // ILK
+// ILK DI Kernel
+#endif
+
+#include "undefall.inc"
+
+//---------------------------------------------------------------------------
+// Message descriptors
+//---------------------------------------------------------------------------
+// Extended message descriptor
+          // Message descriptor for sampler read
+//        //                      = 000 0010 (message len 2) 00000 (resp len - set later from nSMPL_RESP_LEN_*)
+//        //                        1 (header present 1) 0 11 (SIMD32/64 mode) 
+//        //                        1000 (message type) 0000 (DI state index) 
+//        //                        00000000 (binding table index - set later)
+//        //                      = 0x040b8000
+#define nSMPL_DI_MSGDSC           0x040b8000
+
+#define nSMPL_RESP_LEN_DNDI      nRESLEN_12      // 12 - for DN + DI Alg
+#define nSMPL_RESP_LEN_DN_PL     nRESLEN_5       // 5  - for DN Planar Alg
+#define nSMPL_RESP_LEN_DN_PA     nRESLEN_9       // 9  - for DN Packed Alg
+#define nSMPL_RESP_LEN_DI        nRESLEN_9       // 9  - for DI Only Alg
+#define nSMPL_RESP_LEN_PDI       nRESLEN_11      // 11 - for Partial DI Alg
+
+// Attention: the message length is the number of GRFs with data only, without the header
+#define nDPMW_MSG_LEN_STMM       nMSGLEN_1       // 1 - For STMM Save
+#define nDPMW_MSG_LEN_HIST       nMSGLEN_1       // 1 - For Denoise History Save
+#define nDPMW_MSG_LEN_PA_DN_DI   nMSGLEN_4       // 4 - For DN Curr Save
+#define nDPMW_MSG_LEN_PA_DN_NODI nMSGLEN_8       // 8 - For DN Curr Save (denoise only - DI disabled)
+#define nDPMW_MSG_LEN_PL_DN_DI   nMSGLEN_2       // 2 - For DN Curr Save
+#define nDPMW_MSG_LEN_PL_DN_NODI nMSGLEN_4       // 4 - For DN Curr Save (denoise only - DI disabled)
+
+#define nDPW_BLOCK_SIZE_STMM   nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4   // STMM block size 8x4
+
+#undef  nDPW_BLOCK_SIZE_DI
+#undef  nDPW_MSG_SIZE_DI
+#define nDPW_BLOCK_SIZE_DI  nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4    
+#define nDPW_MSG_SIZE_DI    nMSGLEN_4
+
+
+//---------------------------------------------------------------------------
+// Kernel GRF variables 
+//---------------------------------------------------------------------------
+// Register offsets inside the sampler response when DI is enabled
+#define nDI_PREV_FRAME_LUMA_OFFSET          0
+#define nDI_PREV_FRAME_CHROMA_OFFSET        2
+#define nDI_CURR_FRAME_LUMA_OFFSET          4
+#define nDI_CURR_FRAME_CHROMA_OFFSET        6
+#define nDI_STMM_OFFSET                     8
+#define nDI_HIST_OFFSET                     9
+#define nDI_CURR_2ND_FIELD_LUMA_OFFSET     10
+#define nDI_CURR_2ND_FIELD_CHROMA_OFFSET   11
+
+// Register offsets inside the sampler response when DI is disabled
+#define nNODI_LUMA_OFFSET                   0
+#define nNODI_HIST_OFFSET                   4
+#define nNODI_CHROMA_OFFSET                 5
+
+#ifdef DI_ENABLE
+    #define nHIST_OFFSET    nDI_HIST_OFFSET
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame) 
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+	
+#endif
+
+#ifdef DI_DISABLE
+    #define nHIST_OFFSET    nNODI_HIST_OFFSET
+#endif
+
+#if (nSRC_REGION==nREGION_2)
+    #define ub2SRC_Y      ub2BOT_Y
+    #define ub2SRC_U      ub2BOT_U
+    #define ub2SRC_V      ub2BOT_V
+    #define uwDEST_Y      uwBOT_Y
+    #define uwDEST_U      uwBOT_U
+    #define uwDEST_V      uwBOT_V
+    #define nDEST_YUV_REG nTOP_Y
+    #define udDEST_YUV    udTOP_Y_IO
+
+    #define nRESP         nTEMP0         // DI return message requires 12 GRFs
+    #define nDN_YUV       nTOP_Y         // Space for Packing DN for next run requires 8 GRFs
+
+    #undef  nSRC_REGION
+    #define nSRC_REGION   nREGION_2
+
+#else // any other value is treated as nREGION_1
+    #define ub2SRC_Y      ub2TOP_Y
+    #define ub2SRC_U      ub2TOP_U
+    #define ub2SRC_V      ub2TOP_V
+    #define uwDEST_Y      uwTOP_Y
+    #define uwDEST_U      uwTOP_U
+    #define uwDEST_V      uwTOP_V
+    #define nDEST_YUV_REG nBOT_Y
+    #define udDEST_YUV    udBOT_Y_IO
+    #define nRESP         nTEMP0         // DI return message requires 12 GRFs
+    #define nDN_YUV       nBOT_Y         // Space for Packing DN for next run requires 8 GRFs
+
+    #undef  nSRC_REGION
+    #define nSRC_REGION   nREGION_1    // REGION_1 will be the source region for first kernel
+
+#endif
+
+
+    
+
+
+
+
+
+
+// Message response (Denoised & DI-ed pixels & statistics), based at nRESP
+.declare udRESP      Base=REG(r,nRESP) ElementSize=4 SrcRegion=REGION(8,1) DstRegion=<1> Type=ud
+.declare ubRESP      Base=REG(r,nRESP) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub
+
+// For Denoised Curr Output (Used as Prev in Next Run)
+.declare ubDN_YUV           Base=REG(r,nDN_YUV)    ElementSize=1 Type=ub
+.declare udDN_YUV           Base=REG(r,nDN_YUV)    ElementSize=4 Type=ud
+#define  npDN_YUV           nDN_YUV*nGRFWIB                                 
+
+// For DI Process Output (1st and 2nd Frames Output) - declarations below are disabled
+//.declare udDI_YUV_PRIV      Base=REG(r,nTEMP0)    ElementSize=4 Type=ud   // Previous frame DI output
+//.declare udDI_YUV_CURR      Base=REG(r,nTEMP0)    ElementSize=4 Type=ud   // Current frame DI output
+//#define  npDI_YUV           nTEMP0*nGRFWIB                                  
+
+//---------------------------------------------------------------------------
+// Kernel MRF variables 
+//---------------------------------------------------------------------------
+#define  mMSG_SMPL           m1                                              // Sampler Command is in: m1~m2
+.declare mudMSG_SMPL         Base=mMSG_SMPL         ElementSize=4 Type=ud
+.declare muwMSG_SMPL         Base=mMSG_SMPL         ElementSize=2 Type=uw
+
+#define mMSGHDR_DN           m1                                              // Denoise Output: m1~m9 for PA, m3~m5 for PL (NOTE(review): base is m1 - confirm the PL range)
+.declare mudMSGHDR_DN        Base=mMSGHDR_DN        ElementSize=4 Type=ud
+.declare mubMSGHDR_DN        Base=mMSGHDR_DN        ElementSize=1 Type=ub
+
+#define mMSGHDR_STMM         m11                                             // STMM Output: m11~m12
+.declare mudMSGHDR_STMM      Base=mMSGHDR_STMM      ElementSize=4 Type=ud
+#define mMSGHDR_HIST         m13                                             // HIST Output: m13~m14
+.declare mudMSGHDR_HIST      Base=mMSGHDR_HIST      ElementSize=4 Type=ud   // 4-byte elements, matching Type=ud and the other ud declarations
+
+#define mMSGHDR_DI_1ST       m1                                              // DI output: m1~m5
+.declare mudMSGHDR_DI_1ST    Base=mMSGHDR_DI_1ST    ElementSize=4 Type=ud
+#define mMSGHDR_DI_2ND       m6                                              // DI output: m6~m10
+.declare mudMSGHDR_DI_2ND    Base=mMSGHDR_DI_2ND    ElementSize=4 Type=ud
+
+// end of DNDI.inc
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_COMMAND.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_COMMAND.asm
new file mode 100644
index 0000000..2c041fc
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_COMMAND.asm
@@ -0,0 +1,17 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Build the sampler message (header + X/Y origin) and issue the DNDI send
+mov (8)     mudMSG_SMPL(0)<1>        rMSGSRC.0<8;8,1>:ud    NODDCLR         // message header
+mov (1)     muwMSG_SMPL(1,4)<1>      wORIX<0;1,0>:w         NODDCLR_NODDCHK// horizontal origin (word 4 of the 2nd message register)
+mov (1)     muwMSG_SMPL(1,12)<1>     wORIY<0;1,0>:w         NODDCLR_NODDCHK         // vertical origin (word 12 of the 2nd message register)
+//mov (2)     muwMSG_SMPL(1,4)<2>      wORIX<2;2,1>:w       NODDCHK// disabled: this single-instruction form caused a problem during compile
+// NOTE(review): the last mov above uses NODDCLR_NODDCHK, while the Hist_Save chains end with NODDCHK; confirm this is intended.
+send (8)    udRESP(0)<1>    mMSG_SMPL  udDUMMY_NULL   nSMPL_ENGINE    nSMPL_DI_MSGDSC+nSMPL_RESP_LEN+nBI_CURRENT_SRC_YUV_HW_DI:ud
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_Hist_Save.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_Hist_Save.asm
new file mode 100644
index 0000000..91c5bc2
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_Hist_Save.asm
@@ -0,0 +1,20 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+
+// Write denoise history to memory (DI-disabled layout: history is read at nNODI_HIST_OFFSET)
+shr (2)    rMSGSRC.0<1>:ud    wORIX<2;2,1>:w            2:w                       NODDCLR         // X,Y origin / 4
+add (1)    rMSGSRC.0<1>:ud    rMSGSRC.0<0;1,0>:ud       uwSPITCH_DIV2<0;1,0>:uw   NODDCLR_NODDCHK// X origin += uwSPITCH_DIV2
+mov (1)    rMSGSRC.2<1>:ud    nDPW_BLOCK_SIZE_HIST:ud                             NODDCHK         // block width and height (4x2)
+
+mov (8)    mMSGHDR_HIST<1>:ud      rMSGSRC.0<8;8,1>:ud                   // message header
+mov (2)    mudMSGHDR_HIST(1)<1>    udRESP(nNODI_HIST_OFFSET,0)<2;2,1>    // Move two dwords of denoise history to MRF
+
+send (8)   dNULLREG    mMSGHDR_HIST    udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_16x8.asm
new file mode 100644
index 0000000..55f71b5
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_16x8.asm
@@ -0,0 +1,26 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PA_AVS_IEF_16x8.asm ----------
+// Shared AVS/IEF definitions: message registers, sampler descriptors, channel masks
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Sample.asm"
+
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:4:4 internal planar
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Unpack_16x8.asm"
+
+//------------------------------------------------------------------------------
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x4.asm
new file mode 100644
index 0000000..55c201b
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x4.asm
@@ -0,0 +1,25 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PA_AVS_IEF_8x4.asm ----------
+// Shared AVS/IEF definitions: message registers, sampler descriptors, channel masks
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Sample.asm"
+
+//------------------------------------------------------------------------------
+// Unpacking sampler data to 4:2:0 internal planar
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Unpack_8x4.asm"
+
+//------------------------------------------------------------------------------
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x8.asm
new file mode 100644
index 0000000..6bde8c4
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x8.asm
@@ -0,0 +1,25 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PA_AVS_IEF_8x8.asm ----------
+
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Sample.asm"
+
+//------------------------------------------------------------------------------
+// Unpacking sampler data to 4:2:2 internal planar 
+//------------------------------------------------------------------------------
+#include "PA_AVS_IEF_Unpack_8x8.asm"
+
+//------------------------------------------------------------------------------
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Sample.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Sample.asm
new file mode 100644
index 0000000..0b533ef
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Sample.asm
@@ -0,0 +1,34 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PA_AVS_IEF_Sample.asm ----------
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 YUV packed
+//------------------------------------------------------------------------------
+        
+    // 1st 8x8 block: message header setup is pulled in from AVS_SetupFirstBlock.asm
+    #include "AVS_SetupFirstBlock.asm"
+
+    // Enable RGB(YUV) channels: write nAVS_RGB_CHANNELS into dword 2 of the message header
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RGB_CHANNELS:ud   
+
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV
+    // Return YUV in 12 GRFs, starting at uwAVS_RESPONSE(0)
+
+    // 2nd 8x8 block: AVS_SetupSecondBlock.asm runs before the same header register is copied to a second MRF range
+    #include "AVS_SetupSecondBlock.asm"
+
+    mov (16) mAVS_8x8_HDR_2.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR_2    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV
+    // Return YUV in 12 GRFs, starting at uwAVS_RESPONSE_2(0)
+        
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_16x8.asm
new file mode 100644
index 0000000..5dcc988
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_16x8.asm
@@ -0,0 +1,288 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PA_AVS_IEF_Unpack_16x8.asm ----------
+        
+#ifdef AVS_OUTPUT_16_BIT	//Output is packed in AVYU format
+// Move first 8x8 words of Y (uwAVS_RESPONSE GRFs 2,3,8,9) into word 1 of each 4-word packed pixel
+    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(2,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(2,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(2,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(2,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(3,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(3,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(8,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(8,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(8,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(8,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(9,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(9,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(9,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(9,12)<4;4,1>                                   
+
+// Move first 8x8 words of U (uwAVS_RESPONSE GRFs 4,5,10,11) into word 0 of each 4-word packed pixel
+    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(4,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(4,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(4,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(4,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(5,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(5,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(5,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(5,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(10,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(10,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(10,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(10,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(11,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(11,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(11,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(11,12)<4;4,1>                                   
+
+// Move first 8x8 words of V (uwAVS_RESPONSE GRFs 0,1,6,7) into word 2 of each 4-word packed pixel
+    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(0,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(0,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(0,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(0,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(1,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(1,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(1,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(1,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(6,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(6,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(6,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(6,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(7,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(7,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(7,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(7,12)<4;4,1>                                   
+
+// Fill the A channel of the first 8x8 block: constant 0 into word 3 of each 4-word packed pixel
+    mov (4) uwDEST_Y(0,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(1,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(4,3)<4>       0:uw                                  
+    mov (4) uwDEST_Y(5,3)<4>       0:uw                                   
+    mov (4) uwDEST_Y(8,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(9,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(12,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(13,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(16,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(17,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(20,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(21,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(24,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(25,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(28,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(29,3)<4>      0:uw                                   
+
+// Move second 8x8 words of Y (uwAVS_RESPONSE_2 GRFs 2,3,8,9) into word 1; dest GRFs are the first block's +2
+    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(2,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(2,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(2,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(2,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(8,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(8,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(8,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(8,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>                                   
+
+// Move second 8x8 words of U (uwAVS_RESPONSE_2 GRFs 4,5,10,11) into word 0 of each 4-word packed pixel
+    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>                                   
+
+// Move second 8x8 words of V (uwAVS_RESPONSE_2 GRFs 0,1,6,7) into word 2 of each 4-word packed pixel
+    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(6,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(6,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(6,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(6,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>                                   
+
+// Fill the A channel of the second 8x8 block: constant 0 into word 3 of each 4-word packed pixel
+    mov (4) uwDEST_Y(2,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(3,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(6,3)<4>       0:uw                                  
+    mov (4) uwDEST_Y(7,3)<4>       0:uw                                   
+    mov (4) uwDEST_Y(10,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(11,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(14,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(15,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(18,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(19,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(22,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(23,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(26,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(27,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(30,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(31,3)<4>      0:uw                                   
+
+/*	This section will be used if 16-bit output is needed in planar format -vK
+    // Move first 8x8 words of Y to dest GRF
+    mov (8)  uwDEST_Y(0)<1>     uwAVS_RESPONSE(2,0)<8;4,1>                 
+    mov (8)  uwDEST_Y(1)<1>     uwAVS_RESPONSE(2,8)<8;4,1>               
+    mov (8)  uwDEST_Y(2)<1>     uwAVS_RESPONSE(3,0)<8;4,1>                 
+    mov (8)  uwDEST_Y(3)<1>     uwAVS_RESPONSE(3,8)<8;4,1>               
+    mov (8)  uwDEST_Y(4)<1>     uwAVS_RESPONSE(8,0)<8;4,1>                 
+    mov (8)  uwDEST_Y(5)<1>     uwAVS_RESPONSE(8,8)<8;4,1>               
+    mov (8)  uwDEST_Y(6)<1>     uwAVS_RESPONSE(9,0)<8;4,1>                 
+    mov (8)  uwDEST_Y(7)<1>     uwAVS_RESPONSE(9,8)<8;4,1>               
+    
+    // Move first 8x8 words of V to dest GRF  
+    mov (8) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,0)<8;4,1>                 
+    mov (8) uwDEST_V(1)<1>      ubAVS_RESPONSE(0,8)<8;4,1>               
+    mov (8) uwDEST_V(2)<1>      ubAVS_RESPONSE(1,0)<8;4,1>                 
+    mov (8) uwDEST_V(3)<1>      ubAVS_RESPONSE(1,8)<8;4,1>               
+    mov (8) uwDEST_V(4)<1>      ubAVS_RESPONSE(6,0)<8;4,1>                 
+    mov (8) uwDEST_V(5)<1>      ubAVS_RESPONSE(6,8)<8;4,1>               
+    mov (8) uwDEST_V(6)<1>      ubAVS_RESPONSE(7,0)<8;4,1>                 
+    mov (8) uwDEST_V(7)<1>      ubAVS_RESPONSE(7,8)<8;4,1>               
+    
+    // Move first 8x8 words of U to dest GRF        
+    mov (8) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,0)<8;4,1>          
+    mov (8) uwDEST_U(1)<1>      ubAVS_RESPONSE(4,8)<8;4,1>                
+    mov (8) uwDEST_U(2)<1>      ubAVS_RESPONSE(5,0)<8;4,1>          
+    mov (8) uwDEST_U(3)<1>      ubAVS_RESPONSE(5,8)<8;4,1>                
+    mov (8) uwDEST_U(4)<1>      ubAVS_RESPONSE(10,0)<8;4,1>         
+    mov (8) uwDEST_U(5)<1>      ubAVS_RESPONSE(10,8)<8;4,1>               
+    mov (8) uwDEST_U(6)<1>      ubAVS_RESPONSE(11,0)<8;4,1>         
+    mov (8) uwDEST_U(7)<1>      ubAVS_RESPONSE(11,8)<8;4,1>               
+    
+    // Move second 8x8 words of Y to dest GRF
+    mov (8)  uwDEST_Y(0,8)<1>     uwAVS_RESPONSE_2(2,0)<8;4,1>                 
+    mov (8)  uwDEST_Y(1,8)<1>     uwAVS_RESPONSE_2(2,8)<8;4,1>               
+    mov (8)  uwDEST_Y(2,8)<1>     uwAVS_RESPONSE_2(3,0)<8;4,1>                 
+    mov (8)  uwDEST_Y(3,8)<1>     uwAVS_RESPONSE_2(3,8)<8;4,1>               
+    mov (8)  uwDEST_Y(4,8)<1>     uwAVS_RESPONSE_2(8,0)<8;4,1>                 
+    mov (8)  uwDEST_Y(5,8)<1>     uwAVS_RESPONSE_2(8,8)<8;4,1>               
+    mov (8)  uwDEST_Y(6,8)<1>     uwAVS_RESPONSE_2(9,0)<8;4,1>                 
+    mov (8)  uwDEST_Y(7,8)<1>     uwAVS_RESPONSE_2(9,8)<8;4,1>               
+    
+    // Move second 8x8 words of V to dest GRF  
+    mov (8) uwDEST_V(0,8)<1>      ubAVS_RESPONSE_2(0,0)<8;4,1>                 
+    mov (8) uwDEST_V(1,8)<1>      ubAVS_RESPONSE_2(0,8)<8;4,1>               
+    mov (8) uwDEST_V(2,8)<1>      ubAVS_RESPONSE_2(1,0)<8;4,1>                 
+    mov (8) uwDEST_V(3,8)<1>      ubAVS_RESPONSE_2(1,8)<8;4,1>               
+    mov (8) uwDEST_V(4,8)<1>      ubAVS_RESPONSE_2(6,0)<8;4,1>                 
+    mov (8) uwDEST_V(5,8)<1>      ubAVS_RESPONSE_2(6,8)<8;4,1>               
+    mov (8) uwDEST_V(6,8)<1>      ubAVS_RESPONSE_2(7,0)<8;4,1>                 
+    mov (8) uwDEST_V(7,8)<1>      ubAVS_RESPONSE_2(7,8)<8;4,1>               
+    
+    // Move second 8x8 words of U to dest GRF        
+    mov (8) uwDEST_U(0,8)<1>      ubAVS_RESPONSE_2(4,0)<8;4,1>          
+    mov (8) uwDEST_U(1,8)<1>      ubAVS_RESPONSE_2(4,8)<8;4,1>                
+    mov (8) uwDEST_U(2,8)<1>      ubAVS_RESPONSE_2(5,0)<8;4,1>          
+    mov (8) uwDEST_U(3,8)<1>      ubAVS_RESPONSE_2(5,8)<8;4,1>                
+    mov (8) uwDEST_U(4,8)<1>      ubAVS_RESPONSE_2(10,0)<8;4,1>         
+    mov (8) uwDEST_U(5,8)<1>      ubAVS_RESPONSE_2(10,8)<8;4,1>               
+    mov (8) uwDEST_U(6,8)<1>      ubAVS_RESPONSE_2(11,0)<8;4,1>         
+    mov (8) uwDEST_U(7,8)<1>      ubAVS_RESPONSE_2(11,8)<8;4,1>               
+*/
+#else   /* OUTPUT_8_BIT */
+    // Move first 8x8 Y samples to dest GRF: byte offsets 1 and 8+1 (byte stride 2) of response GRFs 2,3,8,9, written as words
+    mov (8)  uwDEST_Y(0)<1>     ubAVS_RESPONSE(2,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(1)<1>     ubAVS_RESPONSE(2,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(2)<1>     ubAVS_RESPONSE(3,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(3)<1>     ubAVS_RESPONSE(3,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(4)<1>     ubAVS_RESPONSE(8,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(5)<1>     ubAVS_RESPONSE(8,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(6)<1>     ubAVS_RESPONSE(9,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(7)<1>     ubAVS_RESPONSE(9,8+1)<16;4,2>               
+
+    // Move first 8x8 V samples to dest GRF: same byte pattern, from response GRFs 0,1,6,7
+    mov (8) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,1)<16;4,2>                 
+    mov (8) uwDEST_V(1)<1>      ubAVS_RESPONSE(0,8+1)<16;4,2>               
+    mov (8) uwDEST_V(2)<1>      ubAVS_RESPONSE(1,1)<16;4,2>                 
+    mov (8) uwDEST_V(3)<1>      ubAVS_RESPONSE(1,8+1)<16;4,2>               
+    mov (8) uwDEST_V(4)<1>      ubAVS_RESPONSE(6,1)<16;4,2>                 
+    mov (8) uwDEST_V(5)<1>      ubAVS_RESPONSE(6,8+1)<16;4,2>               
+    mov (8) uwDEST_V(6)<1>      ubAVS_RESPONSE(7,1)<16;4,2>                 
+    mov (8) uwDEST_V(7)<1>      ubAVS_RESPONSE(7,8+1)<16;4,2>               
+
+    // Move first 8x8 U samples to dest GRF: same byte pattern, from response GRFs 4,5,10,11
+    mov (8) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,1)<16;4,2>           
+    mov (8) uwDEST_U(1)<1>      ubAVS_RESPONSE(4,8+1)<16;4,2>                 
+    mov (8) uwDEST_U(2)<1>      ubAVS_RESPONSE(5,1)<16;4,2>           
+    mov (8) uwDEST_U(3)<1>      ubAVS_RESPONSE(5,8+1)<16;4,2>                 
+    mov (8) uwDEST_U(4)<1>      ubAVS_RESPONSE(10,1)<16;4,2>          
+    mov (8) uwDEST_U(5)<1>      ubAVS_RESPONSE(10,8+1)<16;4,2>                
+    mov (8) uwDEST_U(6)<1>      ubAVS_RESPONSE(11,1)<16;4,2>          
+    mov (8) uwDEST_U(7)<1>      ubAVS_RESPONSE(11,8+1)<16;4,2>                
+
+    // Move second 8x8 Y samples (ubAVS_RESPONSE_2) to word offset 8 of the same dest GRFs
+    mov (8) uwDEST_Y(0,8)<1>          ubAVS_RESPONSE_2(2,1)<16;4,2>    
+    mov (8) uwDEST_Y(1,8)<1>          ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) uwDEST_Y(2,8)<1>          ubAVS_RESPONSE_2(3,1)<16;4,2>     
+    mov (8) uwDEST_Y(3,8)<1>          ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) uwDEST_Y(4,8)<1>          ubAVS_RESPONSE_2(8,1)<16;4,2>  
+    mov (8) uwDEST_Y(5,8)<1>          ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) uwDEST_Y(6,8)<1>          ubAVS_RESPONSE_2(9,1)<16;4,2>     
+    mov (8) uwDEST_Y(7,8)<1>          ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+
+    // Move second 8x8 V samples (ubAVS_RESPONSE_2) to word offset 8 of the same dest GRFs
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE_2(0,1)<16;4,2>           
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE_2(0,8+1)<16;4,2>                 
+    mov (8) uwDEST_V(2,8)<1>          ubAVS_RESPONSE_2(1,1)<16;4,2>           
+    mov (8) uwDEST_V(3,8)<1>          ubAVS_RESPONSE_2(1,8+1)<16;4,2>                 
+    mov (8) uwDEST_V(4,8)<1>          ubAVS_RESPONSE_2(6,1)<16;4,2>           
+    mov (8) uwDEST_V(5,8)<1>          ubAVS_RESPONSE_2(6,8+1)<16;4,2>                 
+    mov (8) uwDEST_V(6,8)<1>          ubAVS_RESPONSE_2(7,1)<16;4,2>           
+    mov (8) uwDEST_V(7,8)<1>          ubAVS_RESPONSE_2(7,8+1)<16;4,2>                 
+
+    // Move second 8x8 U samples (ubAVS_RESPONSE_2) to word offset 8 of the same dest GRFs
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE_2(4,1)<16;4,2>             
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE_2(4,8+1)<16;4,2>           
+    mov (8) uwDEST_U(2,8)<1>          ubAVS_RESPONSE_2(5,1)<16;4,2>             
+    mov (8) uwDEST_U(3,8)<1>          ubAVS_RESPONSE_2(5,8+1)<16;4,2>           
+    mov (8) uwDEST_U(4,8)<1>          ubAVS_RESPONSE_2(10,1)<16;4,2>            
+    mov (8) uwDEST_U(5,8)<1>          ubAVS_RESPONSE_2(10,8+1)<16;4,2>          
+    mov (8) uwDEST_U(6,8)<1>          ubAVS_RESPONSE_2(11,1)<16;4,2>            
+    mov (8) uwDEST_U(7,8)<1>          ubAVS_RESPONSE_2(11,8+1)<16;4,2>          
+#endif
+//------------------------------------------------------------------------------
+
+   // Re-define the row counts after unpacking: both Y and U/V become 8
+   #undef nUV_NUM_OF_ROWS
+   #undef nY_NUM_OF_ROWS
+   
+   #define nY_NUM_OF_ROWS      8
+   #define nUV_NUM_OF_ROWS     8
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x4.asm
new file mode 100644
index 0000000..01d451d
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x4.asm
@@ -0,0 +1,77 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PA_AVS_IEF_Unpack_8x4.asm ----------
+
+// Yoni: In order to optimize unpacking, 3 methods are being checked:
+//  1. AVS_ORIGINAL
+//  2. AVS_ROUND_TO_8_BITS  
+//  3. AVS_INDIRECT_ACCESS  
+//
+// Only 1 method should stay in the code 
+
+
+//#define AVS_ROUND_TO_8_BITS
+//#define AVS_INDIRECT_ACCESS
+
+
+    // Move first 8x8 Y samples to dest GRF: byte offsets 1 and 8+1 (byte stride 2) of response GRFs 2,3,8,9, written as words
+    mov (8)  uwDEST_Y(0)<1>     ubAVS_RESPONSE(2,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(1)<1>     ubAVS_RESPONSE(2,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(2)<1>     ubAVS_RESPONSE(3,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(3)<1>     ubAVS_RESPONSE(3,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(4)<1>     ubAVS_RESPONSE(8,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(5)<1>     ubAVS_RESPONSE(8,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(6)<1>     ubAVS_RESPONSE(9,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(7)<1>     ubAVS_RESPONSE(9,8+1)<16;4,2>               
+
+    // Move first 4x4 words of V to dest GRF: byte offset 1 only (byte stride 4) of response GRFs 0,1,6,7
+    mov (4) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,1)<16;2,4>                 
+    mov (4) uwDEST_V(0,8)<1>    ubAVS_RESPONSE(1,1)<16;2,4>                 
+    mov (4) uwDEST_V(1)<1>      ubAVS_RESPONSE(6,1)<16;2,4>                 
+    mov (4) uwDEST_V(1,8)<1>    ubAVS_RESPONSE(7,1)<16;2,4>                 
+
+    // Move first 4x4 words of U to dest GRF: byte offset 1 only (byte stride 4) of response GRFs 4,5,10,11
+    mov (4) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,1)<16;2,4>           
+    mov (4) uwDEST_U(0,8)<1>    ubAVS_RESPONSE(5,1)<16;2,4>           
+    mov (4) uwDEST_U(1)<1>      ubAVS_RESPONSE(10,1)<16;2,4>          
+    mov (4) uwDEST_U(1,8)<1>    ubAVS_RESPONSE(11,1)<16;2,4>          
+
+    // Move second 8x8 Y samples (ubAVS_RESPONSE_2) to word offset 8 of the same dest GRFs
+    mov (8) uwDEST_Y(0,8)<1>    ubAVS_RESPONSE_2(2,1)<16;4,2>    
+    mov (8) uwDEST_Y(1,8)<1>    ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) uwDEST_Y(2,8)<1>    ubAVS_RESPONSE_2(3,1)<16;4,2>     
+    mov (8) uwDEST_Y(3,8)<1>    ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) uwDEST_Y(4,8)<1>    ubAVS_RESPONSE_2(8,1)<16;4,2>  
+    mov (8) uwDEST_Y(5,8)<1>    ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) uwDEST_Y(6,8)<1>    ubAVS_RESPONSE_2(9,1)<16;4,2>     
+    mov (8) uwDEST_Y(7,8)<1>    ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+
+    // Move second 4x4 words of V (ubAVS_RESPONSE_2) to word offsets 4 and 12 of the same dest GRFs
+    mov (4) uwDEST_V(0,4)<1>    ubAVS_RESPONSE_2(0,1)<16;2,4>           
+    mov (4) uwDEST_V(0,12)<1>   ubAVS_RESPONSE_2(1,1)<16;2,4>           
+    mov (4) uwDEST_V(1,4)<1>    ubAVS_RESPONSE_2(6,1)<16;2,4>           
+    mov (4) uwDEST_V(1,12)<1>   ubAVS_RESPONSE_2(7,1)<16;2,4>           
+
+    // Move second 4x4 words of U (ubAVS_RESPONSE_2) to word offsets 4 and 12 of the same dest GRFs
+    mov (4) uwDEST_U(0,4)<1>    ubAVS_RESPONSE_2(4,1)<16;2,4>             
+    mov (4) uwDEST_U(0,12)<1>   ubAVS_RESPONSE_2(5,1)<16;2,4>             
+    mov (4) uwDEST_U(1,4)<1>    ubAVS_RESPONSE_2(10,1)<16;2,4>            
+    mov (4) uwDEST_U(1,12)<1>   ubAVS_RESPONSE_2(11,1)<16;2,4>            
+
+//------------------------------------------------------------------------------
+
+       // Re-define new number of lines. NOTE(review): only 2 GRFs per U/V plane are written above - confirm nUV_NUM_OF_ROWS 8 is intended
+       #undef nUV_NUM_OF_ROWS
+       #undef nY_NUM_OF_ROWS
+       
+       #define nY_NUM_OF_ROWS      8
+       #define nUV_NUM_OF_ROWS     8
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x8.asm
new file mode 100644
index 0000000..91b2398
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x8.asm
@@ -0,0 +1,93 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PA_AVS_IEF_Unpack_8x8.asm ----------
+
+// Yoni: In order to optimize unpacking, 3 methods are being checked:
+//  1. AVS_ORIGINAL
+//  2. AVS_ROUND_TO_8_BITS  
+//  3. AVS_INDIRECT_ACCESS  
+//
+// Only 1 method should stay in the code 
+
+
+//#define AVS_ROUND_TO_8_BITS
+//#define AVS_INDIRECT_ACCESS
+
+
+    // Move first 8x8 Y samples to dest GRF: byte offsets 1 and 8+1 (byte stride 2) of response GRFs 2,3,8,9, written as words
+    mov (8)  uwDEST_Y(0)<1>     ubAVS_RESPONSE(2,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(1)<1>     ubAVS_RESPONSE(2,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(2)<1>     ubAVS_RESPONSE(3,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(3)<1>     ubAVS_RESPONSE(3,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(4)<1>     ubAVS_RESPONSE(8,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(5)<1>     ubAVS_RESPONSE(8,8+1)<16;4,2>               
+    mov (8)  uwDEST_Y(6)<1>     ubAVS_RESPONSE(9,1)<16;4,2>                 
+    mov (8)  uwDEST_Y(7)<1>     ubAVS_RESPONSE(9,8+1)<16;4,2>               
+
+    // Move first 4x8 words of V to dest GRF: byte offsets 1 and 8+1 (byte stride 4) of response GRFs 0,1,6,7
+    mov (4) uwDEST_V(0)<1>      ubAVS_RESPONSE(0,1)<16;2,4>                 
+    mov (4) uwDEST_V(0,8)<1>    ubAVS_RESPONSE(0,8+1)<16;2,4>               
+    mov (4) uwDEST_V(1)<1>      ubAVS_RESPONSE(1,1)<16;2,4>                 
+    mov (4) uwDEST_V(1,8)<1>    ubAVS_RESPONSE(1,8+1)<16;2,4>               
+    mov (4) uwDEST_V(2)<1>      ubAVS_RESPONSE(6,1)<16;2,4>                 
+    mov (4) uwDEST_V(2,8)<1>    ubAVS_RESPONSE(6,8+1)<16;2,4>               
+    mov (4) uwDEST_V(3)<1>      ubAVS_RESPONSE(7,1)<16;2,4>                 
+    mov (4) uwDEST_V(3,8)<1>    ubAVS_RESPONSE(7,8+1)<16;2,4>               
+
+    // Move first 4x8 words of U to dest GRF: byte offsets 1 and 8+1 (byte stride 4) of response GRFs 4,5,10,11
+    mov (4) uwDEST_U(0)<1>      ubAVS_RESPONSE(4,1)<16;2,4>           
+    mov (4) uwDEST_U(0,8)<1>    ubAVS_RESPONSE(4,8+1)<16;2,4>                 
+    mov (4) uwDEST_U(1)<1>      ubAVS_RESPONSE(5,1)<16;2,4>           
+    mov (4) uwDEST_U(1,8)<1>    ubAVS_RESPONSE(5,8+1)<16;2,4>                 
+    mov (4) uwDEST_U(2)<1>      ubAVS_RESPONSE(10,1)<16;2,4>          
+    mov (4) uwDEST_U(2,8)<1>    ubAVS_RESPONSE(10,8+1)<16;2,4>                
+    mov (4) uwDEST_U(3)<1>      ubAVS_RESPONSE(11,1)<16;2,4>          
+    mov (4) uwDEST_U(3,8)<1>    ubAVS_RESPONSE(11,8+1)<16;2,4>                
+
+    // Move second 8x8 Y samples (ubAVS_RESPONSE_2) to word offset 8 of the same dest GRFs
+    mov (8) uwDEST_Y(0,8)<1>    ubAVS_RESPONSE_2(2,1)<16;4,2>    
+    mov (8) uwDEST_Y(1,8)<1>    ubAVS_RESPONSE_2(2,8+1)<16;4,2>
+    mov (8) uwDEST_Y(2,8)<1>    ubAVS_RESPONSE_2(3,1)<16;4,2>     
+    mov (8) uwDEST_Y(3,8)<1>    ubAVS_RESPONSE_2(3,8+1)<16;4,2>
+    mov (8) uwDEST_Y(4,8)<1>    ubAVS_RESPONSE_2(8,1)<16;4,2>  
+    mov (8) uwDEST_Y(5,8)<1>    ubAVS_RESPONSE_2(8,8+1)<16;4,2>
+    mov (8) uwDEST_Y(6,8)<1>    ubAVS_RESPONSE_2(9,1)<16;4,2>     
+    mov (8) uwDEST_Y(7,8)<1>    ubAVS_RESPONSE_2(9,8+1)<16;4,2>
+
+    // Move second 4x8 words of V (ubAVS_RESPONSE_2) to word offsets 4 and 12 of the same dest GRFs
+    mov (4) uwDEST_V(0,4)<1>    ubAVS_RESPONSE_2(0,1)<16;2,4>           
+    mov (4) uwDEST_V(0,12)<1>   ubAVS_RESPONSE_2(0,8+1)<16;2,4>                 
+    mov (4) uwDEST_V(1,4)<1>    ubAVS_RESPONSE_2(1,1)<16;2,4>           
+    mov (4) uwDEST_V(1,12)<1>   ubAVS_RESPONSE_2(1,8+1)<16;2,4>                 
+    mov (4) uwDEST_V(2,4)<1>    ubAVS_RESPONSE_2(6,1)<16;2,4>           
+    mov (4) uwDEST_V(2,12)<1>   ubAVS_RESPONSE_2(6,8+1)<16;2,4>                 
+    mov (4) uwDEST_V(3,4)<1>    ubAVS_RESPONSE_2(7,1)<16;2,4>           
+    mov (4) uwDEST_V(3,12)<1>   ubAVS_RESPONSE_2(7,8+1)<16;2,4>                 
+
+    // Move second 4x8 words of U (ubAVS_RESPONSE_2) to word offsets 4 and 12 of the same dest GRFs
+    mov (4) uwDEST_U(0,4)<1>    ubAVS_RESPONSE_2(4,1)<16;2,4>             
+    mov (4) uwDEST_U(0,12)<1>   ubAVS_RESPONSE_2(4,8+1)<16;2,4>           
+    mov (4) uwDEST_U(1,4)<1>    ubAVS_RESPONSE_2(5,1)<16;2,4>             
+    mov (4) uwDEST_U(1,12)<1>   ubAVS_RESPONSE_2(5,8+1)<16;2,4>           
+    mov (4) uwDEST_U(2,4)<1>    ubAVS_RESPONSE_2(10,1)<16;2,4>            
+    mov (4) uwDEST_U(2,12)<1>   ubAVS_RESPONSE_2(10,8+1)<16;2,4>          
+    mov (4) uwDEST_U(3,4)<1>    ubAVS_RESPONSE_2(11,1)<16;2,4>            
+    mov (4) uwDEST_U(3,12)<1>   ubAVS_RESPONSE_2(11,8+1)<16;2,4>          
+
+//------------------------------------------------------------------------------
+
+       // Re-define the row counts after unpacking: both Y and U/V become 8
+       #undef nUV_NUM_OF_ROWS
+       #undef nY_NUM_OF_ROWS
+       
+       #define nY_NUM_OF_ROWS      8
+       #define nUV_NUM_OF_ROWS     8
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DNDI_ALG.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DNDI_ALG.asm
new file mode 100644
index 0000000..6aa91c8
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DNDI_ALG.asm
@@ -0,0 +1,139 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+
+    #ifdef DI_ONLY
+		#undef  nSMPL_RESP_LEN
+		#define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DI               // set the number of GRF 
+	#else
+		#undef  nSMPL_RESP_LEN
+		#define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF 
+	#endif
+	
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write is 4x1
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4   // DN Block Size for Write is 32x4
+    
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    //// move the previous frame Y component to internal planar format
+    //$for (0; <nY_NUM_OF_ROWS/2; 1) {
+    //    mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    //}
+    //// move the previous frame U,V components to internal planar format
+    //$for (0; <nUV_NUM_OF_ROWS/2; 1) {
+    //    mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+    //    mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    //}
+    //// move the current frame Y component to internal planar format
+    //$for (0; <nY_NUM_OF_ROWS/2; 1) {
+    //    mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    //}
+    //// move the current frame U,V components to internal planar format
+    //$for (0; <nUV_NUM_OF_ROWS/2; 1) {
+    //    mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+    //    mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    //}
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     NODDCLR          // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    NODDCLR_NODDCHK // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud          NODDCHK         // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header   
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF 
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud      
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+#ifdef DI_ONLY
+#else
+
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run ///////////////
+    // check top/bottom field first
+	cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+	
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:uw
+	//set the save DN position
+    shl (1)     rMSGSRC.0<1>:ud      wORIX<0;1,0>:w          1:w NODDCLR           // X origin * 2
+    mov (1)     rMSGSRC.1<1>:ud      wORIY<0;1,0>:w              NODDCLR_NODDCHK   // Y origin
+    mov (1)     rMSGSRC.2<1>:ud      nDPW_BLOCK_SIZE_DN:ud       NODDCHK             // block width and height (32x4)
+    mov (8)     mudMSGHDR_DN(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header
+	
+    
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+
+BOTTOM_FIELD_FIRST:
+    //$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+    //    mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2)
+    //    mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub   ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3)
+    //    mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2)
+    //    mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2)
+    //    mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3)
+    //    mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field V from current frame (line 1,3)
+    //}
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2)
+        mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub   ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3)
+    }
+
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2)
+        mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3)
+    }
+
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2)
+        mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field V from current frame (line 1,3)
+    }
+
+    jmpi (1) SAVE_DN_CURR
+    
+TOP_FIELD_FIRST:
+    //$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+    //    mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2)
+    //    mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3)
+    //    mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2)
+    //    mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2)
+    //    mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3)
+    //    mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3)
+    //}
+	$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub       ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2)
+        mov (16)    r[pCF_Y_OFFSET,  %1+1*32]<2>:ub     ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3)
+    }
+	$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2)
+        mov (8)     r[pCF_U_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3)
+    }
+	$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub       ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2)
+        mov (8)     r[pCF_V_OFFSET,  %1+1*32]<4>:ub     ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3)
+    }
+	
+SAVE_DN_CURR:
+    $for(0; <nY_NUM_OF_ROWS/2; 1) {
+            mov (8) mudMSGHDR_DN(%1+1)<1>  udDN_YUV(%1)REGION(8,1)
+    }
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_DI+nBI_DESTINATION_YUV:ud
+#endif
+
+// Save Processed frames
+#include "DI_Save_PA.asm"      
+
+
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DN_ALG.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DN_ALG.asm
new file mode 100644
index 0000000..ef88a3c
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DN_ALG.asm
@@ -0,0 +1,54 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#define DI_DISABLE
+
+#include "DNDI.inc"
+
+#undef  nY_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS         8                                 // Number of Y rows per block
+#undef  nUV_NUM_OF_ROWS
+#define nUV_NUM_OF_ROWS        8                                 // Number of U/V rows per block
+
+#undef   nSMPL_RESP_LEN
+#define  nSMPL_RESP_LEN        nSMPL_RESP_LEN_DN_PA              // Set the Number of GRFs in DNDI response 
+#undef   nDPW_BLOCK_SIZE_DN
+#define  nDPW_BLOCK_SIZE_DN    nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8   // DN Curr Block Size for Write is 32x8
+#undef   nDPW_BLOCK_SIZE_HIST
+#define  nDPW_BLOCK_SIZE_HIST  nBLOCK_WIDTH_4+nBLOCK_HEIGHT_2    // HIST Block Size for Write is 4x2
+
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+#include "DNDI_COMMAND.asm"    // NOTE(review): PA_DNDI_ALG.asm includes "DNDI_Command.asm" - confirm the file name case for case-sensitive filesystems
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+#include "DNDI_Hist_Save.asm"
+
+////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run ///////////////
+add (4)     pCF_Y_OFFSET<1>:uw    ubDEST_CF_OFFSET<4;4,1>:ub    npDN_YUV:w 
+$for (0; <nY_NUM_OF_ROWS; 1) {
+    mov (16)    r[pCF_Y_OFFSET,  %1*32]<2>:ub   ubRESP(nNODI_LUMA_OFFSET,%1*16)<16;16,1>       // copy line of Y
+}
+$for (0; <nUV_NUM_OF_ROWS; 1) {
+    mov (8)     r[pCF_U_OFFSET,  %1*32]<4>:ub   ubRESP(nNODI_CHROMA_OFFSET,%1*16+1)<16;8,2>    // copy line of U
+    mov (8)     r[pCF_V_OFFSET,  %1*32]<4>:ub   ubRESP(nNODI_CHROMA_OFFSET,%1*16)<16;8,2>      // copy line of V
+}
+
+shl (1)     rMSGSRC.0<1>:ud     wORIX<0;1,0>:w     1:w       // X origin * 2 (422 output)
+mov (1)     rMSGSRC.1<1>:ud     wORIY<0;1,0>:w               // Y origin
+mov (1)     rMSGSRC.2<1>:ud     nDPW_BLOCK_SIZE_DN:ud        // block width and height (32x8)
+mov (8)     mMSGHDR_DN<1>:ud    rMSGSRC<8;8,1>:ud            // message header   
+
+$for(0; <nY_NUM_OF_ROWS; 2) {
+        mov (16) mudMSGHDR_DN(1+%1)<1>  udDN_YUV(%1)REGION(8,1)    // Move DN Curr to MRF
+}
+send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_NODI+nBI_DESTINATION_YUV:ud     
+
+
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_Scaling.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_Scaling.asm
new file mode 100644
index 0000000..c2a1b1e
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_Scaling.asm
@@ -0,0 +1,70 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PA_Scaling.asm ----------
+#include "Scaling.inc"
+
+	// Build a 16-element ramp in float32 and normalize it
+//	mov (8)		SAMPLER_RAMP(0)<1>		0x76543210:v
+//	add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf		//3, 2, 1, 0 in float vector
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf	//7, 6, 5, 4 in float vector
+add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f
+
+//Module: PrepareScaleCoord.asm
+
+	// Setup for sampler msg hdr
+    mov (2)		rMSGSRC.0<1>:ud			0:ud						{ NoDDClr }	// Unused fields
+    mov (1)		rMSGSRC.2<1>:ud			0:ud						{ NoDDChk }	// Write and offset
+
+	// Calculate 16 v based on the step Y and vertical origin
+	mov	(16)	mfMSGPAYLOAD(2)<1>		fSRC_VID_V_ORI<0;1,0>:f
+	mov	(16)	SCALE_COORD_Y<1>:f		fSRC_VID_V_ORI<0;1,0>:f
+
+	// Calculate 16 u based on the step X and hori origin
+//	line (16)	mfMSGPAYLOAD(0)<1>		SCALE_STEP_X<0;1,0>:f		SAMPLER_RAMP(0) 	// Assign to mrf directly
+	mov	(16)	acc0:f							fSRC_VID_H_ORI<0;1,0>:f											{ Compr }
+	mac	(16)	mfMSGPAYLOAD(0)<1>	fVIDEO_STEP_X<0;1,0>:f	SAMPLER_RAMP(0)			{ Compr }			
+
+	//Setup the constants for line instruction
+	mov 	(1)		SCALE_LINE_P255<1>:f		255.0:f 			{ NoDDClr }	//{ NoDDClr, NoDDChk }
+	mov 	(1)		SCALE_LINE_P0_5<1>:f		0.5:f 				{ NoDDChk }
+	
+//------------------------------------------------------------------------------
+
+$for (0; <nY_NUM_OF_ROWS; 1) {
+
+	// Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
+  mov (8) 	MSGHDR_SCALE.0:ud      rMSGSRC.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+	send (16)	SCALE_RESPONSE_YW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_YUV+nBI_CURRENT_SRC_YUV
+
+	// Calculate 16 v for next line
+	add (16)	mfMSGPAYLOAD(2)<1>		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Assign to mrf directly
+	add (16)	SCALE_COORD_Y<1>:f		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Advance SCALE_COORD_Y by the same step so the next iteration starts from it
+
+	// Scale back to [0, 255], convert f to ud
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(0)	{ Compr }			// Process B, V
+	mov  (16) SCALE_RESPONSE_YD(0)<1>	acc0:f														{ Compr }
+
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(2)	{ Compr }			// Process G, Y
+	mov  (16) SCALE_RESPONSE_YD(2)<1>	acc0:f														{ Compr }
+
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(4)	{ Compr }			// Process R, U
+	mov  (16) SCALE_RESPONSE_YD(4)<1>	acc0:f														{ Compr }
+
+	mov	 (16) 	DEST_V(%1)<1>				SCALE_RESPONSE_YB(0)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_Y(%1)<1>				SCALE_RESPONSE_YB(2)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_U(%1)<1>				SCALE_RESPONSE_YB(4)											//possible error due to truncation - vK
+
+}
+
+	#define nSRC_REGION				nREGION_1
+
+//------------------------------------------------------------------------------
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_16x8.asm
new file mode 100644
index 0000000..2f7b735
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_16x8.asm
@@ -0,0 +1,60 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL2_AVS_IEF_16x8.asm ----------
+
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y each
+// 2 sampler reads for 8x8 U and 8x8 V (NV11\P208 input surface)
+//------------------------------------------------------------------------------
+
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud               
+
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+
+    // 8x8 U and V sampling 
+    // Enable red and blue channels
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud                   
+
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs
+
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+
+    // 2nd 8x8 Y sampling
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud                           
+
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>    mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y 
+
+    // 2nd 8x8 U and V sampling 
+    // Enable red and blue channels
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud                   
+
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs
+
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:4:4 internal planar 
+//------------------------------------------------------------------------------
+    #include "PL2_AVS_IEF_Unpack_16x8.asm"
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x4.asm
new file mode 100644
index 0000000..9b221e7
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x4.asm
@@ -0,0 +1,58 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL2_AVS_IEF_8x4.asm ----------
+
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y each
+// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface)
+//------------------------------------------------------------------------------
+
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud               
+
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+
+    // 8x8 U and V sampling 
+    // Enable red and blue channels  
+    //Only 8x4 will be used  
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud                   
+
+    // Calculate Chroma Step Size:
+    // for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X 
+    // for V direction: 8  Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_Y = Luma_Step_Y
+    mul  (1)  rAVS_PAYLOAD.1:f      fVIDEO_STEP_X:f    2.0:f             // Step X for chroma
+
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs
+
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+
+    // 2nd 8x8 Y sampling
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud                           
+
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>    mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y 
+
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:0 internal planar 
+//------------------------------------------------------------------------------
+    #include "PL2_AVS_IEF_Unpack_8x4.asm"
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x8.asm
new file mode 100644
index 0000000..404fbd0
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x8.asm
@@ -0,0 +1,57 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL2_AVS_IEF_8x8.asm ----------
+
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y each
+// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface)
+//------------------------------------------------------------------------------
+
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud               
+
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+
+    // 8x8 U and V sampling 
+    // Enable red and blue channels    
+    mov (1) rAVS_8x8_HDR.2:ud  nAVS_RED_BLUE_CHANNELS:ud                   
+
+    // Calculate Chroma Step Size:
+    // for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X 
+    // for V direction: 8  Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_Y = Luma_Step_Y
+    mul  (1)  rAVS_PAYLOAD.1:f      fVIDEO_STEP_X:f    2.0:f             // Step X for chroma
+
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV   udDUMMY_NULL  nSMPL_ENGINE    nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV
+    // Return U and V in 8 GRFs
+
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+
+    // 2nd 8x8 Y sampling
+    // Enable green channel only
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud                           
+
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>    mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y 
+
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:2 internal planar 
+//------------------------------------------------------------------------------
+    #include "PL2_AVS_IEF_Unpack_8x8.asm"
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_16x8.asm
new file mode 100644
index 0000000..6c994c1
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_16x8.asm
@@ -0,0 +1,271 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL2_AVS_IEF_Unpack_16x8.asm ----------
+        
+#ifdef AVS_OUTPUT_16_BIT	//Output is packed in AVYU format: 4 words per pixel, word 0 = U, word 1 = Y, word 2 = V, word 3 = A
+// Move first 8x8 words of Y to dest GRF (as packed): Y goes to word 1 of every pixel (dest stride <4>); dest registers 0,1,4,5,...,28,29
+    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(0,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(0,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(0,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(0,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(1,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(1,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(1,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(1,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(2,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(2,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(2,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(2,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(3,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(3,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>                                   
+
+// Move first 8x8 words of U to dest GRF (as packed): U goes to word 0 of every pixel; read from response GRFs 4, 5, 8 and 9
+    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(4,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(4,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(4,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(4,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(5,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(5,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(5,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(5,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(8,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(8,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(8,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(8,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(9,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(9,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(9,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(9,12)<4;4,1>                                   
+
+// Move first 8x8 words of V to dest GRF (as packed): V goes to word 2 of every pixel; read from response GRFs 6, 7, 10 and 11
+    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(6,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(6,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(6,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(6,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(7,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(7,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(7,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(7,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(10,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(10,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(10,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(10,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(11,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(11,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(11,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(11,12)<4;4,1>                                   
+
+// Move first 8x8 words of A to dest GRF (as packed): word 3 of every pixel is filled with the constant 0
+    mov (4) uwDEST_Y(0,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(1,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(4,3)<4>       0:uw                                  
+    mov (4) uwDEST_Y(5,3)<4>       0:uw                                   
+    mov (4) uwDEST_Y(8,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(9,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(12,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(13,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(16,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(17,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(20,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(21,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(24,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(25,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(28,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(29,3)<4>      0:uw                                   
+
+// Move second 8x8 words of Y (from uwAVS_RESPONSE_2) to dest GRF: word 1 of every pixel; dest registers 2,3,6,7,...,30,31 interleave with the first block's
+    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(2,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(2,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(2,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(2,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>                                   
+
+// Move second 8x8 words of U to dest GRF: word 0 of every pixel; read from uwAVS_RESPONSE_2 GRFs 4, 5, 8 and 9
+    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(8,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(8,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(8,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(8,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>                                   
+
+// Move second 8x8 words of V to dest GRF: word 2 of every pixel; read from uwAVS_RESPONSE_2 GRFs 6, 7, 10 and 11
+    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(6,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(6,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(6,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(6,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>                                   
+
+// Move second 8x8 words of A to dest GRF: word 3 of every pixel is filled with the constant 0
+    mov (4) uwDEST_Y(2,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(3,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(6,3)<4>       0:uw                                  
+    mov (4) uwDEST_Y(7,3)<4>       0:uw                                   
+    mov (4) uwDEST_Y(10,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(11,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(14,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(15,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(18,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(19,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(22,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(23,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(26,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(27,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(30,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(31,3)<4>      0:uw                                   
+
+/*	This section will be used if 16-bit output is needed in planar format -vK
+     // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        uwAVS_RESPONSE(%1,0)<8;4,1>     
+        mov (8) uwDEST_Y(%1*2+1)<1>      uwAVS_RESPONSE(%1,8)<8;4,1>   
+    } 
+    
+    // Move 1st 8x8 words of U to dest GRF  (Copy high byte in a word)
+    mov (8) uwDEST_U(0)<1>           uwAVS_RESPONSE(4,0)<8;4,1>      
+    mov (8) uwDEST_U(1)<1>           uwAVS_RESPONSE(4,8)<8;4,1>    
+    mov (8) uwDEST_U(2)<1>           uwAVS_RESPONSE(5,0)<8;4,1>      
+    mov (8) uwDEST_U(3)<1>           uwAVS_RESPONSE(5,8)<8;4,1>    
+    mov (8) uwDEST_U(4)<1>           uwAVS_RESPONSE(8,0)<8;4,1>      
+    mov (8) uwDEST_U(5)<1>           uwAVS_RESPONSE(8,8)<8;4,1>    
+    mov (8) uwDEST_U(6)<1>           uwAVS_RESPONSE(9,0)<8;4,1>      
+    mov (8) uwDEST_U(7)<1>           uwAVS_RESPONSE(9,8)<8;4,1>    
+
+    // Move 1st 8x8 words of V to dest GRF  
+    mov (8) uwDEST_V(0)<1>           uwAVS_RESPONSE(6,0)<8;4,1>      
+    mov (8) uwDEST_V(1)<1>           uwAVS_RESPONSE(6,8)<8;4,1>    
+    mov (8) uwDEST_V(2)<1>           uwAVS_RESPONSE(7,0)<8;4,1>      
+    mov (8) uwDEST_V(3)<1>           uwAVS_RESPONSE(7,8)<8;4,1>    
+    mov (8) uwDEST_V(4)<1>           uwAVS_RESPONSE(10,0)<8;4,1>     
+    mov (8) uwDEST_V(5)<1>           uwAVS_RESPONSE(10,8)<8;4,1>   
+    mov (8) uwDEST_V(6)<1>           uwAVS_RESPONSE(11,0)<8;4,1>     
+    mov (8) uwDEST_V(7)<1>           uwAVS_RESPONSE(11,8)<8;4,1>   
+
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      uwAVS_RESPONSE_2(%1,0)<8;4,1> 
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    uwAVS_RESPONSE_2(%1,8)<8;4,1> 
+    } 
+
+    // Move 2nd 8x8 words of U to dest GRF  (Copy high byte in a word)
+    mov (8) uwDEST_U(0,8)<1>         uwAVS_RESPONSE_2(4,0)<8;4,1>      
+    mov (8) uwDEST_U(1,8)<1>         uwAVS_RESPONSE_2(4,8)<8;4,1>    
+    mov (8) uwDEST_U(2,8)<1>         uwAVS_RESPONSE_2(5,0)<8;4,1>      
+    mov (8) uwDEST_U(3,8)<1>         uwAVS_RESPONSE_2(5,8)<8;4,1>    
+    mov (8) uwDEST_U(4,8)<1>         uwAVS_RESPONSE_2(8,0)<8;4,1>      
+    mov (8) uwDEST_U(5,8)<1>         uwAVS_RESPONSE_2(8,8)<8;4,1>    
+    mov (8) uwDEST_U(6,8)<1>         uwAVS_RESPONSE_2(9,0)<8;4,1>      
+    mov (8) uwDEST_U(7,8)<1>         uwAVS_RESPONSE_2(9,8)<8;4,1>    
+
+    // Move 2nd 8x8 words of V to dest GRF  
+    mov (8) uwDEST_V(0,8)<1>         uwAVS_RESPONSE_2(6,0)<8;4,1>      
+    mov (8) uwDEST_V(1,8)<1>         uwAVS_RESPONSE_2(6,8)<8;4,1>    
+    mov (8) uwDEST_V(2,8)<1>         uwAVS_RESPONSE_2(7,0)<8;4,1>      
+    mov (8) uwDEST_V(3,8)<1>         uwAVS_RESPONSE_2(7,8)<8;4,1>    
+    mov (8) uwDEST_V(4,8)<1>         uwAVS_RESPONSE_2(10,0)<8;4,1>     
+    mov (8) uwDEST_V(5,8)<1>         uwAVS_RESPONSE_2(10,8)<8;4,1>   
+    mov (8) uwDEST_V(6,8)<1>         uwAVS_RESPONSE_2(11,0)<8;4,1>     
+    mov (8) uwDEST_V(7,8)<1>         uwAVS_RESPONSE_2(11,8)<8;4,1>   
+*/
+#else
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF ($for presumably expands %1 = 0..3; two dest GRFs per response GRF).
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        ubAVS_RESPONSE(%1,1)<16;4,2>      // Copy high byte in a word (byte offset 1, byte stride 2)
+        mov (8) uwDEST_Y(%1*2+1)<1>      ubAVS_RESPONSE(%1,8+1)<16;4,2>    // Copy high byte in a word (same, starting 8 bytes further in)
+    } 
+
+    // Move 1st 8x8 words of U to dest GRF, lower 8 words of uwDEST_U(0..7)  (Copy high byte in a word); read from response GRFs 4, 5, 8 and 9
+    mov (8) uwDEST_U(0)<1>           ubAVS_RESPONSE(4,1)<16;4,2>      
+    mov (8) uwDEST_U(1)<1>           ubAVS_RESPONSE(4,8+1)<16;4,2>    
+    mov (8) uwDEST_U(2)<1>           ubAVS_RESPONSE(5,1)<16;4,2>      
+    mov (8) uwDEST_U(3)<1>           ubAVS_RESPONSE(5,8+1)<16;4,2>    
+    mov (8) uwDEST_U(4)<1>           ubAVS_RESPONSE(8,1)<16;4,2>      
+    mov (8) uwDEST_U(5)<1>           ubAVS_RESPONSE(8,8+1)<16;4,2>    
+    mov (8) uwDEST_U(6)<1>           ubAVS_RESPONSE(9,1)<16;4,2>      
+    mov (8) uwDEST_U(7)<1>           ubAVS_RESPONSE(9,8+1)<16;4,2>    
+
+    // Move 1st 8x8 words of V to dest GRF, lower 8 words of uwDEST_V(0..7)  (Copy high byte in a word); read from response GRFs 6, 7, 10 and 11
+    mov (8) uwDEST_V(0)<1>           ubAVS_RESPONSE(6,1)<16;4,2>      
+    mov (8) uwDEST_V(1)<1>           ubAVS_RESPONSE(6,8+1)<16;4,2>    
+    mov (8) uwDEST_V(2)<1>           ubAVS_RESPONSE(7,1)<16;4,2>      
+    mov (8) uwDEST_V(3)<1>           ubAVS_RESPONSE(7,8+1)<16;4,2>    
+    mov (8) uwDEST_V(4)<1>           ubAVS_RESPONSE(10,1)<16;4,2>     
+    mov (8) uwDEST_V(5)<1>           ubAVS_RESPONSE(10,8+1)<16;4,2>   
+    mov (8) uwDEST_V(6)<1>           ubAVS_RESPONSE(11,1)<16;4,2>     
+    mov (8) uwDEST_V(7)<1>           ubAVS_RESPONSE(11,8+1)<16;4,2>   
+
+    // Move 2nd 8x8 words of Y (from ubAVS_RESPONSE_2) to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      ubAVS_RESPONSE_2(%1,1)<16;4,2>    // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    ubAVS_RESPONSE_2(%1,8+1)<16;4,2>  // Copy high byte in a word
+    } 
+
+    // Move 2nd 8x8 words of U to dest GRF, higher 8 words of uwDEST_U(0..7)  (Copy high byte in a word)
+    mov (8) uwDEST_U(0,8)<1>         ubAVS_RESPONSE_2(4,1)<16;4,2>      
+    mov (8) uwDEST_U(1,8)<1>         ubAVS_RESPONSE_2(4,8+1)<16;4,2>    
+    mov (8) uwDEST_U(2,8)<1>         ubAVS_RESPONSE_2(5,1)<16;4,2>      
+    mov (8) uwDEST_U(3,8)<1>         ubAVS_RESPONSE_2(5,8+1)<16;4,2>    
+    mov (8) uwDEST_U(4,8)<1>         ubAVS_RESPONSE_2(8,1)<16;4,2>      
+    mov (8) uwDEST_U(5,8)<1>         ubAVS_RESPONSE_2(8,8+1)<16;4,2>    
+    mov (8) uwDEST_U(6,8)<1>         ubAVS_RESPONSE_2(9,1)<16;4,2>      
+    mov (8) uwDEST_U(7,8)<1>         ubAVS_RESPONSE_2(9,8+1)<16;4,2>    
+
+    // Move 2nd 8x8 words of V to dest GRF, higher 8 words of uwDEST_V(0..7)  (Copy high byte in a word)
+    mov (8) uwDEST_V(0,8)<1>         ubAVS_RESPONSE_2(6,1)<16;4,2>      
+    mov (8) uwDEST_V(1,8)<1>         ubAVS_RESPONSE_2(6,8+1)<16;4,2>    
+    mov (8) uwDEST_V(2,8)<1>         ubAVS_RESPONSE_2(7,1)<16;4,2>      
+    mov (8) uwDEST_V(3,8)<1>         ubAVS_RESPONSE_2(7,8+1)<16;4,2>    
+    mov (8) uwDEST_V(4,8)<1>         ubAVS_RESPONSE_2(10,1)<16;4,2>     
+    mov (8) uwDEST_V(5,8)<1>         ubAVS_RESPONSE_2(10,8+1)<16;4,2>   
+    mov (8) uwDEST_V(6,8)<1>         ubAVS_RESPONSE_2(11,1)<16;4,2>     
+    mov (8) uwDEST_V(7,8)<1>         ubAVS_RESPONSE_2(11,8+1)<16;4,2>   
+#endif
+
+       // Re-define the number of lines after unpacking: 8 Y rows and 8 UV rows (replaces any earlier definitions)
+       #undef nUV_NUM_OF_ROWS
+       #undef nY_NUM_OF_ROWS
+       
+       #define nY_NUM_OF_ROWS      8
+       #define nUV_NUM_OF_ROWS     8
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x4.asm
new file mode 100644
index 0000000..37202f4
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x4.asm
@@ -0,0 +1,45 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL2_AVS_IEF_Unpack_8x4.asm ----------
+        
+    // Move first 8x8 words of Y to dest GRF at lower 8 words of each GRF ($for presumably expands %1 = 0..3).
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        ubAVS_RESPONSE(%1,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1)<1>      ubAVS_RESPONSE(%1,8+1)<16;4,2>    // Copy high byte in a word
+    } 
+
+    // Move 8x4 words of U to dest GRF  (Copy high byte in a word); only the (n,1) region of response GRFs 4, 5, 8, 9 is copied, the (n,8+1) region is not read here
+    mov (8) uwDEST_U(0)<1>            ubAVS_RESPONSE(4,1)<16;4,2>      
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE(5,1)<16;4,2>    
+    mov (8) uwDEST_U(1)<1>            ubAVS_RESPONSE(8,1)<16;4,2>      
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE(9,1)<16;4,2>    
+
+    // Move 8x4 words of V to dest GRF  (Copy high byte in a word); only the (n,1) region of response GRFs 6, 7, 10, 11 is copied
+    mov (8) uwDEST_V(0)<1>            ubAVS_RESPONSE(6,1)<16;4,2>      
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE(7,1)<16;4,2>    
+    mov (8) uwDEST_V(1)<1>            ubAVS_RESPONSE(10,1)<16;4,2>      
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE(11,1)<16;4,2>    
+
+    // Move 2nd 8x8 words of Y (from ubAVS_RESPONSE_2) to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      ubAVS_RESPONSE_2(%1,1)<16;4,2>    // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    ubAVS_RESPONSE_2(%1,8+1)<16;4,2>  // Copy high byte in a word
+    } 
+
+//------------------------------------------------------------------------------
+
+    // Re-define the number of lines after unpacking: 8 Y rows and 4 UV rows
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+   
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     4
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x8.asm
new file mode 100644
index 0000000..ec9f754
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x8.asm
@@ -0,0 +1,53 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL2_AVS_IEF_Unpack_8x8.asm ----------
+        
+    // Move first 8x8 words of Y to dest GRF at lower 8 words of each GRF ($for presumably expands %1 = 0..3).
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>        ubAVS_RESPONSE(%1,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1)<1>      ubAVS_RESPONSE(%1,8+1)<16;4,2>    // Copy high byte in a word
+    } 
+
+    // Move 8x8 words of U to dest GRF  (Copy high byte in a word); (n,1) fills the lower 8 words, (n,8+1) the upper 8; read from response GRFs 4, 5, 8, 9
+    mov (8) uwDEST_U(0)<1>            ubAVS_RESPONSE(4,1)<16;4,2>      
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE(4,8+1)<16;4,2>    
+    mov (8) uwDEST_U(1)<1>            ubAVS_RESPONSE(5,1)<16;4,2>      
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE(5,8+1)<16;4,2>    
+    mov (8) uwDEST_U(2)<1>            ubAVS_RESPONSE(8,1)<16;4,2>      
+    mov (8) uwDEST_U(2,8)<1>          ubAVS_RESPONSE(8,8+1)<16;4,2>    
+    mov (8) uwDEST_U(3)<1>            ubAVS_RESPONSE(9,1)<16;4,2>      
+    mov (8) uwDEST_U(3,8)<1>          ubAVS_RESPONSE(9,8+1)<16;4,2>    
+
+    // Move 8x8 words of V to dest GRF  (Copy high byte in a word); same layout as U, read from response GRFs 6, 7, 10, 11
+    mov (8) uwDEST_V(0)<1>            ubAVS_RESPONSE(6,1)<16;4,2>      
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE(6,8+1)<16;4,2>    
+    mov (8) uwDEST_V(1)<1>            ubAVS_RESPONSE(7,1)<16;4,2>      
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE(7,8+1)<16;4,2>    
+    mov (8) uwDEST_V(2)<1>            ubAVS_RESPONSE(10,1)<16;4,2>     
+    mov (8) uwDEST_V(2,8)<1>          ubAVS_RESPONSE(10,8+1)<16;4,2>   
+    mov (8) uwDEST_V(3)<1>            ubAVS_RESPONSE(11,1)<16;4,2>     
+    mov (8) uwDEST_V(3,8)<1>          ubAVS_RESPONSE(11,8+1)<16;4,2>   
+
+    // Move 2nd 8x8 words of Y (from ubAVS_RESPONSE_2) to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>      ubAVS_RESPONSE_2(%1,1)<16;4,2>    // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>    ubAVS_RESPONSE_2(%1,8+1)<16;4,2>  // Copy high byte in a word
+    } 
+
+//------------------------------------------------------------------------------
+
+    // Re-define the number of lines after unpacking: 8 Y rows and 8 UV rows
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+   
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_Scaling.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_Scaling.asm
new file mode 100644
index 0000000..7849afd
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_Scaling.asm
@@ -0,0 +1,71 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL2_Scaling.asm ----------
+#include "Scaling.inc"
+
+	// Build a 16-element float32 ramp 0.0 .. 15.0 in SAMPLER_RAMP(0) and SAMPLER_RAMP(1). NOTE(review): the old comment said "normalized", but nothing in this file normalizes it
+//	mov (8)		SAMPLER_RAMP(0)<1>		0x76543210:v
+//	add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf		//3, 2, 1, 0 in float vector (listed from the high byte down)
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf	//7, 6, 5, 4 in float vector (listed from the high byte down)
+add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f	// ramp elements 8..15 = elements 0..7 + 8.0
+				
+//Module: PrepareScaleCoord.asm
+
+	// Setup for sampler msg hdr: clear the first three dwords of rMSGSRC
+    mov (2)		rMSGSRC.0<1>:ud			0:ud						{ NoDDClr }	// Unused fields
+    mov (1)		rMSGSRC.2<1>:ud			0:ud						{ NoDDChk }	// Write and offset
+
+	// Initialise all 16 v to the vertical origin (scalar broadcast), both in the message payload and in SCALE_COORD_Y; step Y is added once per row in the loop below
+	mov	(16)	mfMSGPAYLOAD(2)<1>		fSRC_VID_V_ORI<0;1,0>:f
+	mov	(16)	SCALE_COORD_Y<1>:f		fSRC_VID_V_ORI<0;1,0>:f
+
+	// Calculate 16 u: u[i] = hori origin + step X * ramp[i] (acc0 is preloaded with the origin, then mac adds step X * ramp and writes the payload)
+//	line (16)	mfMSGPAYLOAD(0)<1>		SCALE_STEP_X<0;1,0>:f		SAMPLER_RAMP(0) 	// Assign to mrf directly
+	mov	(16)	acc0:f							fSRC_VID_H_ORI<0;1,0>:f											{ Compr }
+	mac	(16)	mfMSGPAYLOAD(0)<1>	fVIDEO_STEP_X<0;1,0>:f	SAMPLER_RAMP(0)			{ Compr }			
+
+	//Setup the constants 255.0 and 0.5 for the line instructions in the row loop (presumably result = sample * 255.0 + 0.5 -- TODO confirm the register layout in Scaling.inc)
+	mov 	(1)		SCALE_LINE_P255<1>:f		255.0:f 			{ NoDDClr }	//{ NoDDClr, NoDDChk }
+	mov 	(1)		SCALE_LINE_P0_5<1>:f		0.5:f 				{ NoDDChk }
+
+//------------------------------------------------------------------------------
+
+$for (0; <nY_NUM_OF_ROWS; 1) {
+
+	// Read 16 sampled pixels (one SIMD16 message for Y, one for UV, same header) and store them in float32 in 8 GRFs in the order of BGRA (VYUA). NOTE(review): the two sends are back-to-back; other kernels here insert an instruction between sends -- confirm this is intended
+  mov (8) 	MSGHDR_SCALE.0:ud      rMSGSRC.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+	send (16)	SCALE_RESPONSE_YW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_Y+nBI_CURRENT_SRC_Y
+	send (16)	SCALE_RESPONSE_UW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_UV+nBI_CURRENT_SRC_UV
+
+	// Calculate 16 v for next line: v += step Y, written to both the message payload and the SCALE_COORD_Y copy
+	add (16)	mfMSGPAYLOAD(2)<1>		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Assign to mrf directly
+	add (16)	SCALE_COORD_Y<1>:f		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Keep the running v in SCALE_COORD_Y as the source for the next iteration
+
+	// Scale back to [0, 255], convert f to ud
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(0)	{ Compr }			// Process the Y response (old comment said "B, V" on all three lines)
+	mov  (16) SCALE_RESPONSE_YD(0)<1>	acc0:f														{ Compr }
+
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_UF(0)	{ Compr }			// Process UV response regs 0.. (presumably U, judging by the DEST_U move below)
+	mov  (16) SCALE_RESPONSE_UD(0)<1>	acc0:f														{ Compr }
+
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_UF(2)	{ Compr }			// Process UV response regs 2.. (presumably V, judging by the DEST_V move below)
+	mov  (16) SCALE_RESPONSE_UD(2)<1>	acc0:f														{ Compr }
+
+	mov	 (16) 	DEST_Y(%1)<1>				SCALE_RESPONSE_YB(0)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_U(%1)<1>				SCALE_RESPONSE_UB(0)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_V(%1)<1>				SCALE_RESPONSE_UB(2)											//possible error due to truncation - vK
+
+}
+
+	#define nSRC_REGION				nREGION_1
+
+//------------------------------------------------------------------------------
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_16x8.asm
new file mode 100644
index 0000000..50a050c
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_16x8.asm
@@ -0,0 +1,69 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL3_AVS_IEF_16x8.asm ----------
+
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y surface 
+// 2 sampler reads for 8x8 U surface 
+// 2 sampler reads for 8x8 V surface
+//------------------------------------------------------------------------------
+
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+
+    // 1st 8x8 Y sampling; response is written starting at uwAVS_RESPONSE(0)
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>  mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+
+    // 1st 8x8 U sampling; response is written starting at uwAVS_RESPONSE(4)
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Enable red channel
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs    
+    send (1) uwAVS_RESPONSE(4)<1>     mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U
+    // Return U in 4 GRFs
+
+    // 1st 8x8 V sampling; same header and channel mask as U, only the descriptor (nSI_SRC_V / nBI_CURRENT_SRC_V) and destination uwAVS_RESPONSE(8) differ
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Dummy instruction (re-writes the mask already set for U) to avoid back-to-back send instructions
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(8)<1>     mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V
+    // Return V in 4 GRFs
+
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+
+    // 2nd 8x8 Y sampling; response is written starting at uwAVS_RESPONSE_2(0)
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1>   mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y 
+    // Return Y in 4 GRFs
+
+    // 2nd 8x8 U sampling; response is written starting at uwAVS_RESPONSE_2(4)
+    mov  (1) rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Enable red channel
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(4)<1>     mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U
+    // Return U in 4 GRFs
+        
+    mov (1) rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Dummy instruction (re-writes the mask already set for U) just in order to avoid back-to-back send instructions!
+
+    // 2nd 8x8 V sampling; same header and channel mask as U, response is written starting at uwAVS_RESPONSE_2(8)
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(8)<1>     mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V
+    // Return V in 4 GRFs
+
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:4:4 internal planar 
+//------------------------------------------------------------------------------
+    #include "PL3_AVS_IEF_Unpack_16x8.asm"
+    
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x4.asm
new file mode 100644
index 0000000..35a5dd3
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x4.asm
@@ -0,0 +1,60 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL3_AVS_IEF_8x4.asm ----------
+
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y surface 
+// 1 sampler read  for 8x8 U surface 
+// 1 sampler read  for 8x8 V surface
+//------------------------------------------------------------------------------
+
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+
+    // 1st 8x8 Y sampling; response is written starting at uwAVS_RESPONSE(0)
+    mov (1) rAVS_8x8_HDR.2:ud       nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+
+    // 8x8 U sampling; only 8x4 of it is used by the unpack step. Response starts at uwAVS_RESPONSE(4)
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Enable red channel
+    mul (1)  rAVS_PAYLOAD.1:f       fVIDEO_STEP_X:f    2.0:f    // Chroma step X = 2 * luma step X (overwrites rAVS_PAYLOAD.1 until restored below)
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1>   mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U
+    // Return U in 4 GRFs
+
+    // 8x8 V sampling; only 8x4 of it is used by the unpack step. Response starts at uwAVS_RESPONSE(8)
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Filler: re-writes the same red-channel select so the two sends are not back-to-back
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(8)<1>   mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V
+    // Return V in 4 GRFs
+
+    // 2nd 8x8 setup (NOTE(review): rAVS_PAYLOAD.1 still holds the doubled chroma step here - confirm the include does not read it)
+    #include "AVS_SetupSecondBlock.asm"
+
+    // 2nd 8x8 Y sampling; response starts at uwAVS_RESPONSE(12), after the Y/U/V responses above
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (1)  rAVS_PAYLOAD.1:f       fVIDEO_STEP_X:f             // Restore luma step X (undoes the chroma doubling above)
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(12)<1>  mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y 
+    // Return Y in 4 GRFs
+
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:0 internal planar 
+//------------------------------------------------------------------------------
+    #include "PL3_AVS_IEF_Unpack_8x4.asm"
+    
+
+                    
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x8.asm
new file mode 100644
index 0000000..d67ad04
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x8.asm
@@ -0,0 +1,60 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL3_AVS_IEF_8x8.asm ----------
+
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 Y surface 
+// 1 sampler read  for 8x8 U surface 
+// 1 sampler read  for 8x8 V surface
+//------------------------------------------------------------------------------
+
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+
+    // 1st 8x8 Y sampling; response is written starting at uwAVS_RESPONSE(0)
+    mov (1) rAVS_8x8_HDR.2:ud       nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y
+    // Return Y in 4 GRFs
+
+    // 8x8 U sampling; response starts at uwAVS_RESPONSE(4)
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Enable red channel
+    mul (1)  rAVS_PAYLOAD.1:f       fVIDEO_STEP_X:f    2.0:f    // Chroma step X = 2 * luma step X (overwrites rAVS_PAYLOAD.1 until restored below)
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(4)<1>   mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U
+    // Return U in 4 GRFs
+
+    // 8x8 V sampling; response starts at uwAVS_RESPONSE(8)
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_RED_CHANNEL_ONLY:ud     // Filler: re-writes the same red-channel select so the two sends are not back-to-back
+    mov (16) mAVS_8x8_HDR_UV.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(8)<1>   mAVS_8x8_HDR_UV   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V
+    // Return V in 4 GRFs
+
+    // 2nd 8x8 setup (NOTE(review): rAVS_PAYLOAD.1 still holds the doubled chroma step here - confirm the include does not read it)
+    #include "AVS_SetupSecondBlock.asm"
+
+    // 2nd 8x8 Y sampling; response starts at uwAVS_RESPONSE(12), after the Y/U/V responses above
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_GREEN_CHANNEL_ONLY:ud   // Enable green channel
+    mov (1)  rAVS_PAYLOAD.1:f       fVIDEO_STEP_X:f             // Restore luma step X (undoes the chroma doubling above)
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(12)<1>  mAVS_8x8_HDR   udDUMMY_NULL      nSMPL_ENGINE        nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y 
+    // Return Y in 4 GRFs
+
+//------------------------------------------------------------------------------
+// Unpacking sampler reads to 4:2:2 internal planar 
+//------------------------------------------------------------------------------
+    #include "PL3_AVS_IEF_Unpack_8x8.asm"
+    
+
+                    
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_16x8.asm
new file mode 100644
index 0000000..f88ab89
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_16x8.asm
@@ -0,0 +1,240 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL3_AVS_IEF_Unpack_16x8.asm ----------
+        
+#ifdef AVS_OUTPUT_16_BIT	//Output is packed in AVYU format
+// Move first 8x8 words of Y to dest GRF (packed: Y is written to word 1 of every 4-word group)
+    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(0,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(0,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(0,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(0,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(1,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(1,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(1,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(1,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(2,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(2,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(2,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(2,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(3,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(3,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>                                   
+
+// Move first 8x8 words of U to dest GRF (packed: U is written to word 0 of every 4-word group)
+    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(4,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(4,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(4,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(4,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(5,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(5,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(5,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(5,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(6,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(6,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(6,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(6,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(7,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(7,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(7,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(7,12)<4;4,1>                                   
+
+// Move first 8x8 words of V to dest GRF (packed: V is written to word 2 of every 4-word group)
+    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(8,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(8,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(8,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(8,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(9,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(9,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(9,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(9,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(10,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(10,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(10,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(10,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(11,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(11,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(11,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(11,12)<4;4,1>                                   
+
+// Zero-fill the A words of the first 8x8 in dest GRF (packed: A is word 3 of every 4-word group)
+    mov (4) uwDEST_Y(0,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(1,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(4,3)<4>       0:uw                                  
+    mov (4) uwDEST_Y(5,3)<4>       0:uw                                   
+    mov (4) uwDEST_Y(8,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(9,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(12,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(13,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(16,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(17,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(20,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(21,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(24,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(25,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(28,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(29,3)<4>      0:uw                                   
+
+// Move second 8x8 words of Y to dest GRF (packed: Y is written to word 1 of every 4-word group)
+    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(2,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(2,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(2,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(2,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>                                   
+
+// Move second 8x8 words of U to dest GRF (packed: U is written to word 0 of every 4-word group)
+    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(6,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(6,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(6,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(6,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>                                   
+
+// Move second 8x8 words of V to dest GRF (packed: V is written to word 2 of every 4-word group)
+    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(8,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(8,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(8,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(8,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>                                   
+
+// Zero-fill the A words of the second 8x8 in dest GRF (packed: A is word 3 of every 4-word group)
+    mov (4) uwDEST_Y(2,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(3,3)<4>       0:uw                                    
+    mov (4) uwDEST_Y(6,3)<4>       0:uw                                  
+    mov (4) uwDEST_Y(7,3)<4>       0:uw                                   
+    mov (4) uwDEST_Y(10,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(11,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(14,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(15,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(18,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(19,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(22,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(23,3)<4>      0:uw                                   
+    mov (4) uwDEST_Y(26,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(27,3)<4>      0:uw                                    
+    mov (4) uwDEST_Y(30,3)<4>      0:uw                                  
+    mov (4) uwDEST_Y(31,3)<4>      0:uw                                   
+
+/*	This section will be used if 16-bit output is needed in planar format -vK
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>          uwAVS_RESPONSE(%1)<8;4,1>        
+        mov (8) uwDEST_Y(%1*2+1)<1>        uwAVS_RESPONSE(%1,8)<8;4,1>      
+    } 
+
+    // Move 8x8 words of U to dest GRF  
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1*2)<1>          uwAVS_RESPONSE(%1+4)<8;4,1>  
+        mov (8) uwDEST_U(%1*2+1)<1>        uwAVS_RESPONSE(%1+4,8)<8;4,1> 
+    } 
+
+    // Move 8x8 words of V to dest GRF  
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1*2)<1>          uwAVS_RESPONSE(%1+8)<8;4,1>      
+        mov (8) uwDEST_V(%1*2+1)<1>        uwAVS_RESPONSE(%1+8,8)<8;4,1>    
+    } 
+
+    // Move 2nd 8x8 words of Y to dest GRF  
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>          uwAVS_RESPONSE_2(%1)<8;4,1>        
+        mov (8) uwDEST_Y(%1*2+1,8)<1>        uwAVS_RESPONSE_2(%1,8)<8;4,1>      
+    } 
+
+    // Move 2nd 8x8 words of U to dest GRF  
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1*2,8)<1>          uwAVS_RESPONSE_2(%1+4)<8;4,1>  
+        mov (8) uwDEST_U(%1*2+1,8)<1>        uwAVS_RESPONSE_2(%1+4,8)<8;4,1> 
+    } 
+
+    // Move 2nd 8x8 words of V to dest GRF  
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1*2,8)<1>          uwAVS_RESPONSE_2(%1+8)<8;4,1>      
+        mov (8) uwDEST_V(%1*2+1,8)<1>        uwAVS_RESPONSE_2(%1+8,8)<8;4,1>    
+    } 
+*/
+#else /* OUTPUT_8_BIT */
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>          ubAVS_RESPONSE(%1,1)<16;4,2>        // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1)<1>        ubAVS_RESPONSE(%1,8+1)<16;4,2>      // Copy high byte in a word
+    } 
+
+    // Move 1st 8x8 words of U to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1*2)<1>          ubAVS_RESPONSE(%1+4,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_U(%1*2+1)<1>        ubAVS_RESPONSE(%1+4,8+1)<16;4,2>    // Copy high byte in a word
+    } 
+
+    // Move 1st 8x8 words of V to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1*2)<1>          ubAVS_RESPONSE(%1+8,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_V(%1*2+1)<1>        ubAVS_RESPONSE(%1+8,8+1)<16;4,2>    // Copy high byte in a word
+    } 
+
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>          ubAVS_RESPONSE_2(%1,1)<16;4,2>     // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>        ubAVS_RESPONSE_2(%1,8+1)<16;4,2>   // Copy high byte in a word
+    } 
+
+    // Move 2nd 8x8 words of U to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1*2,8)<1>          ubAVS_RESPONSE_2(%1+4,1)<16;4,2>   // Copy high byte in a word
+        mov (8) uwDEST_U(%1*2+1,8)<1>        ubAVS_RESPONSE_2(%1+4,8+1)<16;4,2> // Copy high byte in a word
+    } 
+
+    // Move 2nd 8x8 words of V to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1*2,8)<1>          ubAVS_RESPONSE_2(%1+8,1)<16;4,2>   // Copy high byte in a word
+        mov (8) uwDEST_V(%1*2+1,8)<1>        ubAVS_RESPONSE_2(%1+8,8+1)<16;4,2> // Copy high byte in a word
+    } 
+#endif
+//------------------------------------------------------------------------------
+    // Re-define the row counts for code that follows this include: 8 rows of Y, 8 rows of U/V
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+      
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8
+                   
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x4.asm
new file mode 100644
index 0000000..53586e6
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x4.asm
@@ -0,0 +1,45 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL3_AVS_IEF_Unpack_8x4.asm ----------
+        
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>          ubAVS_RESPONSE(%1,1)<16;4,2>        // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1)<1>        ubAVS_RESPONSE(%1,8+1)<16;4,2>      // Copy high byte in a word
+	}
+
+    // Move 8x4 words of U to dest GRF, two 8-word rows per GRF (Copy high byte in a word)
+    mov (8) uwDEST_U(0)<1>            ubAVS_RESPONSE(4,1)<16;4,2>      
+    mov (8) uwDEST_U(0,8)<1>          ubAVS_RESPONSE(4,9)<16;4,2>    
+    mov (8) uwDEST_U(1)<1>            ubAVS_RESPONSE(5,1)<16;4,2>      
+    mov (8) uwDEST_U(1,8)<1>          ubAVS_RESPONSE(5,9)<16;4,2>    
+
+    // Move 8x4 words of V to dest GRF, two 8-word rows per GRF (Copy high byte in a word)
+    mov (8) uwDEST_V(0)<1>            ubAVS_RESPONSE(8,1)<16;4,2>      
+    mov (8) uwDEST_V(0,8)<1>          ubAVS_RESPONSE(8,9)<16;4,2>    
+    mov (8) uwDEST_V(1)<1>            ubAVS_RESPONSE(9,1)<16;4,2>      
+    mov (8) uwDEST_V(1,8)<1>          ubAVS_RESPONSE(9,9)<16;4,2>    
+
+    // Move 2nd 8x8 words of Y (response GRFs 12-15) to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>        ubAVS_RESPONSE(%1+12,1)<16;4,2>     // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>      ubAVS_RESPONSE(%1+12,8+1)<16;4,2>   // Copy high byte in a word
+    } 
+
+//------------------------------------------------------------------------------
+    // Re-define the row counts for code that follows this include: 8 rows of Y, 4 rows of U/V
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+       
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     4
+                    
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x8.asm
new file mode 100644
index 0000000..f16d04a
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x8.asm
@@ -0,0 +1,44 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL3_AVS_IEF_Unpack_8x8.asm ----------
+        
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1>          ubAVS_RESPONSE(%1,1)<16;4,2>        // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1)<1>        ubAVS_RESPONSE(%1,8+1)<16;4,2>      // Copy high byte in a word
+	}
+    // Move 8x8 words of U to dest GRF, two 8-word rows per GRF (lower half, then upper half)
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1)<1>            ubAVS_RESPONSE(%1+4,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_U(%1,8)<1>          ubAVS_RESPONSE(%1+4,8+1)<16;4,2>    // Copy high byte in a word
+    } 
+
+    // Move 8x8 words of V to dest GRF, two 8-word rows per GRF (lower half, then upper half)
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1)<1>            ubAVS_RESPONSE(%1+8,1)<16;4,2>      // Copy high byte in a word
+        mov (8) uwDEST_V(%1,8)<1>          ubAVS_RESPONSE(%1+8,8+1)<16;4,2>    // Copy high byte in a word
+    } 
+
+    // Move 2nd 8x8 words of Y (response GRFs 12-15) to dest GRF at higher 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1>        ubAVS_RESPONSE(%1+12,1)<16;4,2>     // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1>      ubAVS_RESPONSE(%1+12,8+1)<16;4,2>   // Copy high byte in a word
+    } 
+
+//------------------------------------------------------------------------------
+    // Re-define the row counts for code that follows this include: 8 rows of Y, 8 rows of U/V
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+       
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8
+                    
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_Scaling.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_Scaling.asm
new file mode 100644
index 0000000..3d5c689
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_Scaling.asm
@@ -0,0 +1,72 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL3_Scaling.asm ----------
+#include "Scaling.inc"
+
+	// Build a 16-element float32 ramp: 0..7 in SAMPLER_RAMP(0), then 8..15 in SAMPLER_RAMP(1) by adding 8.0
+//	mov (8)		SAMPLER_RAMP(0)<1>		0x76543210:v
+//	add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf		{ NoDDClr }//3, 2, 1, 0 in float vector
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf	{ NoDDChk }//7, 6, 5, 4 in float vector
+add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f
+
+			
+//Module: PrepareScaleCoord.asm
+
+	// Setup for sampler msg hdr: the first three dwords of rMSGSRC are cleared
+    mov (2)		rMSGSRC.0<1>:ud			0:ud						{ NoDDClr }	// Unused fields
+    mov (1)		rMSGSRC.2<1>:ud			0:ud						{ NoDDChk }	// Write and offset
+
+	// Broadcast the vertical origin as v to all 16 lanes: once into the message payload, once into SCALE_COORD_Y as the running copy
+	mov	(16)	mfMSGPAYLOAD(2)<1>		fSRC_VID_V_ORI<0;1,0>:f
+	mov	(16)	SCALE_COORD_Y<1>:f		fSRC_VID_V_ORI<0;1,0>:f
+
+	// Calculate 16 u: horizontal origin (loaded into acc0) + step X * ramp, written straight to the message payload
+//	line (16)	mfMSGPAYLOAD(0)<1>		SCALE_STEP_X<0;1,0>:f		SAMPLER_RAMP(0) 	// Assign to mrf directly
+	mov	(16)	acc0:f							fSRC_VID_H_ORI<0;1,0>:f											{ Compr }
+	mac	(16)	mfMSGPAYLOAD(0)<1>	fVIDEO_STEP_X<0;1,0>:f	SAMPLER_RAMP(0)			{ Compr }			
+
+	// Set up the constants (255.0 and 0.5) used by the line instructions in the loop below
+	mov 	(1)		SCALE_LINE_P255<1>:f		255.0:f 			{ NoDDClr }	//{ NoDDClr, NoDDChk }
+	mov 	(1)		SCALE_LINE_P0_5<1>:f		0.5:f 				{ NoDDChk }
+
+//------------------------------------------------------------------------------
+
+$for (0; <nY_NUM_OF_ROWS; 1) {
+	// Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
+  mov (8) 	MSGHDR_SCALE<1>:ud      	rMSGSRC<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+	send (16)	SCALE_RESPONSE_VW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_V+nBI_CURRENT_SRC_V
+	send (16)	SCALE_RESPONSE_YW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_Y+nBI_CURRENT_SRC_Y
+	send (16)	SCALE_RESPONSE_UW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_U+nBI_CURRENT_SRC_U
+
+	// Calculate 16 v for next line: advance by step Y in both the message payload and the running copy
+	add (16)	mfMSGPAYLOAD(2)<1>		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Assign to mrf directly
+	add (16)	SCALE_COORD_Y<1>:f		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Keep the running v in SCALE_COORD_Y for the next row
+
+	// Scale back to [0, 255], convert f to ud
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_VF(0)	{ Compr }			// Process B, V
+	mov  (16) SCALE_RESPONSE_VD(0)<1>	acc0:f														{ Compr }
+
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(0)	{ Compr }			// Process G, Y
+	mov  (16) SCALE_RESPONSE_YD(0)<1>	acc0:f														{ Compr }
+
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_UF(0)	{ Compr }			// Process R, U
+	mov  (16) SCALE_RESPONSE_UD(0)<1>	acc0:f														{ Compr }
+
+	mov	 (16) 	DEST_V(%1)<1>				SCALE_RESPONSE_VB(0)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_Y(%1)<1>				SCALE_RESPONSE_YB(0)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_U(%1)<1>				SCALE_RESPONSE_UB(0)											//possible error due to truncation - vK
+
+}
+
+	#define nSRC_REGION				nREGION_1
+
+//------------------------------------------------------------------------------
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG.asm
new file mode 100644
index 0000000..e6d8fb2
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG.asm
@@ -0,0 +1,85 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+
+    #ifdef DI_ONLY
+		#undef  nSMPL_RESP_LEN
+		#define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DI               // set the number of GRF 
+	#else
+		#undef  nSMPL_RESP_LEN
+		#define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF 
+	#endif
+	
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write is 4x1 (width 4, height 1)
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+    
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w  NODDCLR_NODDCHK             // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                 NODDCLR_NODDCHK    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud        NODDCHK             // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header   
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF 
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud      
+
+#ifdef DI_ONLY
+#else
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    
+	//set the save DN parameters
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w              NODDCLR             // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud       NODDCLR_NODDCHK     // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud                     
+	
+    // check top/bottom field first
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+    }
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+	
+    jmpi (1) SAVE_DN_CURR
+    
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+    }
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+	
+SAVE_DN_CURR:
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+#endif
+
+// Save Processed frames
+#include "DI_Save_PA.asm"      
+
+
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV11.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV11.asm
new file mode 100644
index 0000000..96aed78
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV11.asm
@@ -0,0 +1,103 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+    
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame) 
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+
+    #undef  nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF 
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write is 4x1 (width 4, height 1)
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+    #undef  nDPR_BLOCK_SIZE_UV
+    #define nDPR_BLOCK_SIZE_UV			nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4   //  DN Block Size for UV Write/Read is 8x4
+   
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+    // move the current frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud           // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header   
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF 
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud      
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:w
+    // check top/bottom field first
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+    
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+
+
+/////////////////////////////NV11 UV Copy 422/////////////////////////////////////////////////////
+		//Read UV through DATAPORT    
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    asr (1)  rMSGSRC.0<1>:d     rMSGSRC.0<0;1,0>:d       1:w   // U/V block X origin should be half of Y's (only element 0 is shifted)
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x4, as defined at the top of this file)
+    mov  (8) mudMSGHDR_DN<1>     rMSGSRC<8;8,1>:ud
+    send (8) udBOT_U_IO(0)<1>     mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_UV:ud
+
+ 		//Write UV through DATAPORT
+		mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+		asr (1)     rMSGSRC.0<1>:d    rMSGSRC.0<0;1,0>:d    1:w  // U/V block X origin should be half of Y's (only element 0 is shifted)
+    mov (1)     rMSGSRC.2<1>:ud        nDPR_BLOCK_SIZE_UV:ud        // block width and height (8x4, as defined at the top of this file)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    mov (8)			mudMSGHDR_DN(1)<1>		 udBOT_U_IO(0)<8;8,1>
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_UV:ud
+\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV12.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV12.asm
new file mode 100644
index 0000000..69330ba
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV12.asm
@@ -0,0 +1,103 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+    
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame) 
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+
+    #undef  nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF 
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write is 4x1 (width 4, height 1)
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+    #undef  nDPR_BLOCK_SIZE_UV
+    #define nDPR_BLOCK_SIZE_UV			nBLOCK_WIDTH_16+nBLOCK_HEIGHT_2   // DN Block Size for UV Write/Read is 16x2
+   
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_COMMAND.asm"    // NOTE(review): sibling kernels include "DNDI_Command.asm" - confirm the file-name case for case-sensitive filesystems
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+    // move the current frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud           // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header   
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF 
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud      
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:w
+    // check top/bottom field first
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+    
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+
+
+/////////////////////////////NV12 UV Copy 422/////////////////////////////////////////////////////
+		//Read UV through DATAPORT    
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    asr (1)  rMSGSRC.1<1>:d     rMSGSRC.1<0;1,0>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (16x2)
+    mov  (8) mudMSGHDR_DN<1>     rMSGSRC<8;8,1>:ud
+    send (8) udBOT_U_IO(0)<1>     mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_UV:ud
+
+ 		//Write UV through DATAPORT
+		mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+		asr (1)     rMSGSRC.1<1>:d         rMSGSRC.1<0;1,0>:d    1:w  // U/V block origin should be half of Y's
+    mov (1)     rMSGSRC.2<1>:ud        nDPR_BLOCK_SIZE_UV:ud        // block width and height (16x2)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    mov (8)			mudMSGHDR_DN(1)<1>		 udBOT_U_IO(0)<8;8,1>
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_UV:ud 
+\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_P208.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_P208.asm
new file mode 100644
index 0000000..7fba14c
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_P208.asm
@@ -0,0 +1,101 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+    
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame) 
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+
+    #undef  nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF 
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write is 4x1 (width 4, height 1)
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+   
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+    // move the current frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud           // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header   
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF 
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud      
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:w
+    // check top/bottom field first
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+    
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+
+
+/////////////////////////////P208 UV Copy 422/////////////////////////////////////////////////////
+		//Read UV through DATAPORT    
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    mov  (1) rMSGSRC.2<1>:ud	nDPW_BLOCK_SIZE_DN:ud							// Y Block width and height (16x4) (U/V block size is the same)
+    mov  (8) mudMSGHDR_DN<1>     rMSGSRC<8;8,1>:ud
+    send (8) udBOT_U_IO(0)<1>     mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_2+nBI_CURRENT_SRC_UV:ud
+
+ 		//Write UV through DATAPORT
+		mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    mov (8)			mudMSGHDR_DN(1)<1>		 udBOT_U_IO(0)<8;8,1>
+    mov (8)			mudMSGHDR_DN(2)<1>		 udBOT_U_IO(1)<8;8,1>
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_UV:ud
+ 
+\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_PL3.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_PL3.asm
new file mode 100644
index 0000000..f7b891d
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_PL3.asm
@@ -0,0 +1,106 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+    
+    #undef  nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS      8       // Number of Y rows per block (4 rows for each frame) 
+    #undef  nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS     8       // Number of U/V rows per block
+
+    #undef  nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN          nSMPL_RESP_LEN_DNDI               // set the number of GRF 
+    #undef  nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST    nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1    // HIST Block Size for Write is 4x1 (width 4, height 1)
+    #undef  nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN      nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4   // DN Block Size for Write is 16x4
+    #undef  nDPR_BLOCK_SIZE_UV
+    #define nDPR_BLOCK_SIZE_UV			nBLOCK_WIDTH_8+nBLOCK_HEIGHT_2   //  DN Block Size for UV Write/Read is 8x2
+   
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1>    ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1>   ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+    // move the current frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1>  ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2>  //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1>   ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2>    //V pixels
+    }
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1)     rMSGSRC.0<1>:ud        wORIX<0;1,0>:w            1:w     // X origin / 2
+    mov (1)     rMSGSRC.1<1>:ud        wORIY<0;1,0>:w                    // Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_STMM:ud           // block width and height (8x4)
+    mov (8)     mudMSGHDR_STMM(0)<1>   rMSGSRC.0<8;8,1>:ud               // message header   
+    mov (8)     mudMSGHDR_STMM(1)<1>   udRESP(nDI_STMM_OFFSET,0)         // Move STMM to MRF 
+    send (8)    dNULLREG               mMSGHDR_STMM              udDUMMY_NULL    nDATAPORT_WRITE     nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud      
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4)     pCF_Y_OFFSET<1>:uw          ubSRC_CF_OFFSET<4;4,1>:ub  npDN_YUV:w
+    // check top/bottom field first
+    cmp.e.f0.0 (1)  null<1>:w               ubTFLD_FIRST<0;1,0>:ub     1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST
+
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR
+    
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4)     mudMSGHDR_DN(1,%1*4)<1>     udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4)     mudMSGHDR_DN(1,%1*4+4)<1>   udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin
+    mov (1)     rMSGSRC.2<1>:ud        nDPW_BLOCK_SIZE_DN:ud        // block width and height (16x4)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud
+    send (8)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+
+
+/////////////////////////////IMC3 UV Copy 422/////////////////////////////////////////////////////
+		//Read UV through DATAPORT    
+    add  (2) rMSGSRC.0<1>:d     wORIX<2;2,1>:w    wSRC_H_ORI_OFFSET<2;2,1>:w       // Source Y Block origin
+    asr (2)  rMSGSRC.0<1>:d     rMSGSRC.0<2;2,1>:d       1:w   // U/V block origin should be half of Y's
+    mov (1)  rMSGSRC.2<1>:ud    nDPR_BLOCK_SIZE_UV:ud          // U/V block width and height (8x2)  
+    mov  (8) mudMSGHDR_DN<1>     rMSGSRC<8;8,1>:ud
+    send (4) udBOT_U_IO(0)<1>       mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_U:ud
+    send (4) udBOT_V_IO(0)<1>       mMSGHDR_DN    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_V:ud
+
+ 		//Write UV through DATAPORT (U and V planes are written with two separate sends)
+		mov (2)     rMSGSRC.0<1>:ud        wORIX<2;2,1>:w               // X origin and Y origin - NOTE(review): both elements are rewritten by the asr on the next line, so this mov looks redundant
+		asr  (2)    rMSGSRC.0<1>:d     wORIX<2;2,1>:w    1:w   // U/V block origin (both X and Y) should be half of Y's
+    mov (1)     rMSGSRC.2<1>:ud        nDPR_BLOCK_SIZE_UV:ud        // block width and height (8x2)
+    mov (8)     mudMSGHDR_DN(0)<1>     rMSGSRC.0<8;8,1>:ud          // message header, shared by both writes below
+    mov (4)			mudMSGHDR_DN(1)<1>		 udBOT_U_IO(0)<4;4,1>   // payload: the U block read above
+    send (4)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_U:ud   // write to the destination U surface
+    mov (4)			mudMSGHDR_DN(1)<1>		 udBOT_V_IO(0)<4;4,1>   // payload: the V block read above (header is reused)
+    send (4)    dNULLREG    mMSGHDR_DN   udDUMMY_NULL    nDATAPORT_WRITE    nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_V:ud   // write to the destination V surface
+\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DN_ALG.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DN_ALG.asm
new file mode 100644
index 0000000..0b9aa4c
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DN_ALG.asm
@@ -0,0 +1,35 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+#define DI_DISABLE
+
+#include "DNDI.inc"
+
+#undef  nY_NUM_OF_ROWS
+#define nY_NUM_OF_ROWS         8                                 // Number of Y rows per block
+
+#undef   nSMPL_RESP_LEN
+#define  nSMPL_RESP_LEN        nSMPL_RESP_LEN_DN_PL              // Set the Number of GRFs in DNDI response 
+#undef   nDPW_BLOCK_SIZE_DN
+#define  nDPW_BLOCK_SIZE_DN    nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8   // DN Curr Block Size for Write is 16x8
+#undef   nDPW_BLOCK_SIZE_HIST
+#define  nDPW_BLOCK_SIZE_HIST  nBLOCK_WIDTH_4+nBLOCK_HEIGHT_2    // HIST Block Size for Write is 4x2
+
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+#include "DNDI_COMMAND.asm"    // NOTE(review): sibling kernels include "DNDI_Command.asm" - confirm the file-name case for case-sensitive filesystems
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+$for (0; <nY_NUM_OF_ROWS; 1) {
+    mov (16)    uwDEST_Y(0,%1*16)<1>   ubRESP(nNODI_LUMA_OFFSET,%1*16)<16;16,1>       // copy line of Y
+}
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+#include "DNDI_Hist_Save.asm"
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_16x8.asm
new file mode 100644
index 0000000..efc7cd6
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_16x8.asm
@@ -0,0 +1,33 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- RGB_AVS_IEF_16x8.asm ----------
+
+#include "AVS_IEF.inc"
+
+//------------------------------------------------------------------------------
+// 2 sampler reads for 8x8 ARGB packed
+//------------------------------------------------------------------------------
+
+    // 1st 8x8 setup
+    #include "AVS_SetupFirstBlock.asm"
+
+    mov (1)  rAVS_8x8_HDR.2:ud      nAVS_ALL_CHANNELS:ud   // Enable ARGB channels
+    mov (16) mAVS_8x8_HDR.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE(0)<1>   mAVS_8x8_HDR    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_4CH+nSI_SRC_RGB+nBI_CURRENT_SRC_YUV
+    // Return ARGB in 16 GRFs
+
+    // 2nd 8x8 setup
+    #include "AVS_SetupSecondBlock.asm"
+    mov (16) mAVS_8x8_HDR_2.0:ud      rAVS_8x8_HDR.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+    send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR_2    udDUMMY_NULL    nSMPL_ENGINE    nAVS_MSG_DSC_4CH+nSI_SRC_RGB+nBI_CURRENT_SRC_YUV
+    // Return ARGB in 16 GRFs
+
+        
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unpack_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unpack_16x8.asm
new file mode 100644
index 0000000..6e2de97
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unpack_16x8.asm
@@ -0,0 +1,251 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- RGB_AVS_IEF_Unpack_16x8.asm ----------
+#include "AVS_IEF.inc"
+
+#ifdef AVS_OUTPUT_16_BIT
+// First 8x8 block, B (response regs 4-5, 12-13): each mov copies 4 contiguous words to word 2 of each 4-word group (dst stride <4>), i.e. packed output
+    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(4,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(4,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(4,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(4,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(5,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(5,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(5,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(5,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(12,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(12,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(12,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(12,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(13,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(13,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(13,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(13,12)<4;4,1>                                   
+
+// First 8x8 block, G (response regs 2-3, 10-11): same pattern, written to word 1 of each 4-word group
+    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(2,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(2,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(2,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(2,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(3,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(3,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(10,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(10,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(10,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(10,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(11,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(11,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(11,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(11,12)<4;4,1>                                   
+
+// First 8x8 block, R (response regs 0-1, 8-9): same pattern, written to word 0 of each 4-word group
+    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(0,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(0,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(0,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(0,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(1,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(1,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(1,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(1,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(8,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(8,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(8,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(8,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(9,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(9,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(9,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(9,12)<4;4,1>                                   
+
+// First 8x8 block, A (response regs 6-7, 14-15): same pattern, written to word 3 of each 4-word group
+    mov (4) uwDEST_Y(0,3)<4>       uwAVS_RESPONSE(6,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(1,3)<4>       uwAVS_RESPONSE(6,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(4,3)<4>       uwAVS_RESPONSE(6,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(5,3)<4>       uwAVS_RESPONSE(6,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(8,3)<4>       uwAVS_RESPONSE(7,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(9,3)<4>       uwAVS_RESPONSE(7,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(12,3)<4>      uwAVS_RESPONSE(7,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(13,3)<4>      uwAVS_RESPONSE(7,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(16,3)<4>      uwAVS_RESPONSE(14,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(17,3)<4>      uwAVS_RESPONSE(14,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(20,3)<4>      uwAVS_RESPONSE(14,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(21,3)<4>      uwAVS_RESPONSE(14,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(24,3)<4>      uwAVS_RESPONSE(15,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(25,3)<4>      uwAVS_RESPONSE(15,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(28,3)<4>      uwAVS_RESPONSE(15,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(29,3)<4>      uwAVS_RESPONSE(15,12)<4;4,1>                                   
+
+// Second 8x8 block, B: read from uwAVS_RESPONSE_2; fills the DEST_Y registers the first block skipped (2,3,6,7,...), word 2
+    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(12,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(12,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(12,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(12,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(13,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(13,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(13,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(13,12)<4;4,1>                                   
+
+// Second 8x8 block, G: same pattern from uwAVS_RESPONSE_2, written to word 1 of each 4-word group
+    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(2,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(2,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(2,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(2,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>                                   
+
+// Second 8x8 block, R: same pattern from uwAVS_RESPONSE_2, written to word 0 of each 4-word group
+    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(8,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(8,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(8,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(8,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>                                   
+
+// Second 8x8 block, A: same pattern from uwAVS_RESPONSE_2, written to word 3 of each 4-word group
+    mov (4) uwDEST_Y(2,3)<4>       uwAVS_RESPONSE_2(6,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(3,3)<4>       uwAVS_RESPONSE_2(6,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(6,3)<4>       uwAVS_RESPONSE_2(6,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(7,3)<4>       uwAVS_RESPONSE_2(6,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(10,3)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>                                      
+    mov (4) uwDEST_Y(11,3)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>                                      
+    mov (4) uwDEST_Y(14,3)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>                                    
+    mov (4) uwDEST_Y(15,3)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>                                    
+    mov (4) uwDEST_Y(18,3)<4>      uwAVS_RESPONSE_2(14,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(19,3)<4>      uwAVS_RESPONSE_2(14,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(22,3)<4>      uwAVS_RESPONSE_2(14,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(23,3)<4>      uwAVS_RESPONSE_2(14,12)<4;4,1>                                   
+    mov (4) uwDEST_Y(26,3)<4>      uwAVS_RESPONSE_2(15,0)<4;4,1>                                     
+    mov (4) uwDEST_Y(27,3)<4>      uwAVS_RESPONSE_2(15,8)<4;4,1>                                     
+    mov (4) uwDEST_Y(30,3)<4>      uwAVS_RESPONSE_2(15,4)<4;4,1>                                   
+    mov (4) uwDEST_Y(31,3)<4>      uwAVS_RESPONSE_2(15,12)<4;4,1>                                   
+
+#else   /* 8-bit output: AVS_OUTPUT_16_BIT is not defined */
+// First 8x8 block, B: every other response byte from byte 1 (presumably the high byte of each 16-bit sample - TODO confirm) -> byte 2 of each 4-byte group (dst stride <4>)
+    mov (8) ubDEST_Y(0,2)<4>       ubAVS_RESPONSE(4,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(2,2)<4>       ubAVS_RESPONSE(4,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(4,2)<4>       ubAVS_RESPONSE(5,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(6,2)<4>       ubAVS_RESPONSE(5,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(8,2)<4>       ubAVS_RESPONSE(12,1)<16;4,2>                                     
+    mov (8) ubDEST_Y(10,2)<4>      ubAVS_RESPONSE(12,8+1)<16;4,2>                                   
+    mov (8) ubDEST_Y(12,2)<4>      ubAVS_RESPONSE(13,1)<16;4,2>                                     
+    mov (8) ubDEST_Y(14,2)<4>      ubAVS_RESPONSE(13,8+1)<16;4,2>                                   
+
+// First 8x8 block, G (response regs 2-3, 10-11): same pattern, written to byte 1 of each 4-byte group
+    mov (8) ubDEST_Y(0,1)<4>       ubAVS_RESPONSE(2,1)<16;4,2>              
+    mov (8) ubDEST_Y(2,1)<4>       ubAVS_RESPONSE(2,8+1)<16;4,2>            
+    mov (8) ubDEST_Y(4,1)<4>       ubAVS_RESPONSE(3,1)<16;4,2>              
+    mov (8) ubDEST_Y(6,1)<4>       ubAVS_RESPONSE(3,8+1)<16;4,2>            
+    mov (8) ubDEST_Y(8,1)<4>       ubAVS_RESPONSE(10,1)<16;4,2>             
+    mov (8) ubDEST_Y(10,1)<4>      ubAVS_RESPONSE(10,8+1)<16;4,2>           
+    mov (8) ubDEST_Y(12,1)<4>      ubAVS_RESPONSE(11,1)<16;4,2>             
+    mov (8) ubDEST_Y(14,1)<4>      ubAVS_RESPONSE(11,8+1)<16;4,2>           
+
+// First 8x8 block, R (response regs 0-1, 8-9): same pattern, written to byte 0 of each 4-byte group
+    mov (8) ubDEST_Y(0,0)<4>       ubAVS_RESPONSE(0,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(2,0)<4>       ubAVS_RESPONSE(0,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(4,0)<4>       ubAVS_RESPONSE(1,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(6,0)<4>       ubAVS_RESPONSE(1,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(8,0)<4>       ubAVS_RESPONSE(8,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(10,0)<4>      ubAVS_RESPONSE(8,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(12,0)<4>      ubAVS_RESPONSE(9,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(14,0)<4>      ubAVS_RESPONSE(9,8+1)<16;4,2>                                    
+
+// First 8x8 block, A (response regs 6-7, 14-15): same pattern, written to byte 3 of each 4-byte group
+    mov (8) ubDEST_Y(0,3)<4>       ubAVS_RESPONSE(6,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(2,3)<4>       ubAVS_RESPONSE(6,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(4,3)<4>       ubAVS_RESPONSE(7,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(6,3)<4>       ubAVS_RESPONSE(7,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(8,3)<4>       ubAVS_RESPONSE(14,1)<16;4,2>                                     
+    mov (8) ubDEST_Y(10,3)<4>      ubAVS_RESPONSE(14,8+1)<16;4,2>                                   
+    mov (8) ubDEST_Y(12,3)<4>      ubAVS_RESPONSE(15,1)<16;4,2>                                     
+    mov (8) ubDEST_Y(14,3)<4>      ubAVS_RESPONSE(15,8+1)<16;4,2>                                   
+
+// Second 8x8 block, B: read from ubAVS_RESPONSE_2; fills the odd DEST_Y registers (1,3,5,...) the first block skipped, byte 2
+    mov (8) ubDEST_Y(1,2)<4>       ubAVS_RESPONSE_2(4,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(3,2)<4>       ubAVS_RESPONSE_2(4,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(5,2)<4>       ubAVS_RESPONSE_2(5,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(7,2)<4>       ubAVS_RESPONSE_2(5,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(9,2)<4>       ubAVS_RESPONSE_2(12,1)<16;4,2>                                     
+    mov (8) ubDEST_Y(11,2)<4>      ubAVS_RESPONSE_2(12,8+1)<16;4,2>                                   
+    mov (8) ubDEST_Y(13,2)<4>      ubAVS_RESPONSE_2(13,1)<16;4,2>                                     
+    mov (8) ubDEST_Y(15,2)<4>      ubAVS_RESPONSE_2(13,8+1)<16;4,2>                                   
+
+// Second 8x8 block, G: same pattern from ubAVS_RESPONSE_2, written to byte 1 of each 4-byte group
+    mov (8) ubDEST_Y(1,1)<4>       ubAVS_RESPONSE_2(2,1)<16;4,2>              
+    mov (8) ubDEST_Y(3,1)<4>       ubAVS_RESPONSE_2(2,8+1)<16;4,2>            
+    mov (8) ubDEST_Y(5,1)<4>       ubAVS_RESPONSE_2(3,1)<16;4,2>              
+    mov (8) ubDEST_Y(7,1)<4>       ubAVS_RESPONSE_2(3,8+1)<16;4,2>            
+    mov (8) ubDEST_Y(9,1)<4>       ubAVS_RESPONSE_2(10,1)<16;4,2>             
+    mov (8) ubDEST_Y(11,1)<4>      ubAVS_RESPONSE_2(10,8+1)<16;4,2>           
+    mov (8) ubDEST_Y(13,1)<4>      ubAVS_RESPONSE_2(11,1)<16;4,2>             
+    mov (8) ubDEST_Y(15,1)<4>      ubAVS_RESPONSE_2(11,8+1)<16;4,2>           
+
+// Second 8x8 block, R: same pattern from ubAVS_RESPONSE_2, written to byte 0 of each 4-byte group
+    mov (8) ubDEST_Y(1,0)<4>       ubAVS_RESPONSE_2(0,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(3,0)<4>       ubAVS_RESPONSE_2(0,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(5,0)<4>       ubAVS_RESPONSE_2(1,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(7,0)<4>       ubAVS_RESPONSE_2(1,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(9,0)<4>       ubAVS_RESPONSE_2(8,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(11,0)<4>      ubAVS_RESPONSE_2(8,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(13,0)<4>      ubAVS_RESPONSE_2(9,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(15,0)<4>      ubAVS_RESPONSE_2(9,8+1)<16;4,2>                                    
+
+// Second 8x8 block, A: same pattern from ubAVS_RESPONSE_2, written to byte 3 of each 4-byte group
+    mov (8) ubDEST_Y(1,3)<4>       ubAVS_RESPONSE_2(6,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(3,3)<4>       ubAVS_RESPONSE_2(6,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(5,3)<4>       ubAVS_RESPONSE_2(7,1)<16;4,2>                                      
+    mov (8) ubDEST_Y(7,3)<4>       ubAVS_RESPONSE_2(7,8+1)<16;4,2>                                    
+    mov (8) ubDEST_Y(9,3)<4>       ubAVS_RESPONSE_2(14,1)<16;4,2>                                     
+    mov (8) ubDEST_Y(11,3)<4>      ubAVS_RESPONSE_2(14,8+1)<16;4,2>                                   
+    mov (8) ubDEST_Y(13,3)<4>      ubAVS_RESPONSE_2(15,1)<16;4,2>                                     
+    mov (8) ubDEST_Y(15,3)<4>      ubAVS_RESPONSE_2(15,8+1)<16;4,2>                                   
+#endif
+//------------------------------------------------------------------------------
+
+    // Set SRC_REGION to REGION_2 so the bottom region is written to memory. NOTE(review): no #undef precedes this #define, unlike the row counts below - confirm SRC_REGION is not already defined
+    #define SRC_REGION                              REGION_2
+ 
+    // Override the row counts: drop any earlier definitions, then set both Y and UV to 8 rows
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+       
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8
+        
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unscramble_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unscramble_16x8.asm
new file mode 100644
index 0000000..b81923f
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unscramble_16x8.asm
@@ -0,0 +1,260 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- RGB_AVS_IEF_Unscramble_16x8.asm ----------
+#include "AVS_IEF.inc"
+
+.declare DEST_B		Base=REG(r,10)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_G		Base=REG(r,18)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_R		Base=REG(r,26)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_A		Base=REG(r,34)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+
+
+#ifdef AVS_OUTPUT_16_BIT
+//This portion will need to be changed if unpacking is required for Y416 kernels (in case of blending etc) - vK
+
+//// Move first 8x8 words of B to dest GRF (as packed)
+//    mov (4) uwDEST_Y(0,2)<4>       uwAVS_RESPONSE(4,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(1,2)<4>       uwAVS_RESPONSE(4,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(4,2)<4>       uwAVS_RESPONSE(4,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(5,2)<4>       uwAVS_RESPONSE(4,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(8,2)<4>       uwAVS_RESPONSE(5,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(9,2)<4>       uwAVS_RESPONSE(5,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(12,2)<4>      uwAVS_RESPONSE(5,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(13,2)<4>      uwAVS_RESPONSE(5,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(16,2)<4>      uwAVS_RESPONSE(12,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(17,2)<4>      uwAVS_RESPONSE(12,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(20,2)<4>      uwAVS_RESPONSE(12,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(21,2)<4>      uwAVS_RESPONSE(12,12)<4;4,1>                                   
+//    mov (4) uwDEST_Y(24,2)<4>      uwAVS_RESPONSE(13,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(25,2)<4>      uwAVS_RESPONSE(13,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(28,2)<4>      uwAVS_RESPONSE(13,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(29,2)<4>      uwAVS_RESPONSE(13,12)<4;4,1>                                   
+//
+//// Move first 8x8 words of G to dest GRF (as packed)
+//    mov (4) uwDEST_Y(0,1)<4>       uwAVS_RESPONSE(2,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(1,1)<4>       uwAVS_RESPONSE(2,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(4,1)<4>       uwAVS_RESPONSE(2,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(5,1)<4>       uwAVS_RESPONSE(2,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(8,1)<4>       uwAVS_RESPONSE(3,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(9,1)<4>       uwAVS_RESPONSE(3,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(12,1)<4>      uwAVS_RESPONSE(3,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(13,1)<4>      uwAVS_RESPONSE(3,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(16,1)<4>      uwAVS_RESPONSE(10,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(17,1)<4>      uwAVS_RESPONSE(10,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(20,1)<4>      uwAVS_RESPONSE(10,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(21,1)<4>      uwAVS_RESPONSE(10,12)<4;4,1>                                   
+//    mov (4) uwDEST_Y(24,1)<4>      uwAVS_RESPONSE(11,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(25,1)<4>      uwAVS_RESPONSE(11,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(28,1)<4>      uwAVS_RESPONSE(11,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(29,1)<4>      uwAVS_RESPONSE(11,12)<4;4,1>                                   
+//
+//// Move first 8x8 words of R to dest GRF (as packed)
+//    mov (4) uwDEST_Y(0,0)<4>       uwAVS_RESPONSE(0,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(1,0)<4>       uwAVS_RESPONSE(0,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(4,0)<4>       uwAVS_RESPONSE(0,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(5,0)<4>       uwAVS_RESPONSE(0,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(8,0)<4>       uwAVS_RESPONSE(1,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(9,0)<4>       uwAVS_RESPONSE(1,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(12,0)<4>      uwAVS_RESPONSE(1,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(13,0)<4>      uwAVS_RESPONSE(1,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(16,0)<4>      uwAVS_RESPONSE(8,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(17,0)<4>      uwAVS_RESPONSE(8,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(20,0)<4>      uwAVS_RESPONSE(8,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(21,0)<4>      uwAVS_RESPONSE(8,12)<4;4,1>                                   
+//    mov (4) uwDEST_Y(24,0)<4>      uwAVS_RESPONSE(9,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(25,0)<4>      uwAVS_RESPONSE(9,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(28,0)<4>      uwAVS_RESPONSE(9,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(29,0)<4>      uwAVS_RESPONSE(9,12)<4;4,1>                                   
+//
+//// Move first 8x8 words of A to dest GRF (as packed)
+//    mov (4) uwDEST_Y(0,3)<4>       uwAVS_RESPONSE(6,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(1,3)<4>       uwAVS_RESPONSE(6,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(4,3)<4>       uwAVS_RESPONSE(6,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(5,3)<4>       uwAVS_RESPONSE(6,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(8,3)<4>       uwAVS_RESPONSE(7,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(9,3)<4>       uwAVS_RESPONSE(7,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(12,3)<4>      uwAVS_RESPONSE(7,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(13,3)<4>      uwAVS_RESPONSE(7,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(16,3)<4>      uwAVS_RESPONSE(14,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(17,3)<4>      uwAVS_RESPONSE(14,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(20,3)<4>      uwAVS_RESPONSE(14,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(21,3)<4>      uwAVS_RESPONSE(14,12)<4;4,1>                                   
+//    mov (4) uwDEST_Y(24,3)<4>      uwAVS_RESPONSE(15,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(25,3)<4>      uwAVS_RESPONSE(15,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(28,3)<4>      uwAVS_RESPONSE(15,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(29,3)<4>      uwAVS_RESPONSE(15,12)<4;4,1>                                   
+//
+//// Move second 8x8 words of B to dest GRF
+//    mov (4) uwDEST_Y(2,2)<4>       uwAVS_RESPONSE_2(4,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(3,2)<4>       uwAVS_RESPONSE_2(4,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(6,2)<4>       uwAVS_RESPONSE_2(4,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(7,2)<4>       uwAVS_RESPONSE_2(4,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(10,2)<4>      uwAVS_RESPONSE_2(5,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(11,2)<4>      uwAVS_RESPONSE_2(5,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(14,2)<4>      uwAVS_RESPONSE_2(5,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(15,2)<4>      uwAVS_RESPONSE_2(5,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(18,2)<4>      uwAVS_RESPONSE_2(12,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(19,2)<4>      uwAVS_RESPONSE_2(12,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(22,2)<4>      uwAVS_RESPONSE_2(12,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(23,2)<4>      uwAVS_RESPONSE_2(12,12)<4;4,1>                                   
+//    mov (4) uwDEST_Y(26,2)<4>      uwAVS_RESPONSE_2(13,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(27,2)<4>      uwAVS_RESPONSE_2(13,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(30,2)<4>      uwAVS_RESPONSE_2(13,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(31,2)<4>      uwAVS_RESPONSE_2(13,12)<4;4,1>                                   
+//
+//// Move second 8x8 words of G to dest GRF
+//    mov (4) uwDEST_Y(2,1)<4>       uwAVS_RESPONSE_2(2,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(3,1)<4>       uwAVS_RESPONSE_2(2,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(6,1)<4>       uwAVS_RESPONSE_2(2,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(7,1)<4>       uwAVS_RESPONSE_2(2,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(10,1)<4>      uwAVS_RESPONSE_2(3,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(11,1)<4>      uwAVS_RESPONSE_2(3,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(14,1)<4>      uwAVS_RESPONSE_2(3,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(15,1)<4>      uwAVS_RESPONSE_2(3,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(18,1)<4>      uwAVS_RESPONSE_2(10,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(19,1)<4>      uwAVS_RESPONSE_2(10,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(22,1)<4>      uwAVS_RESPONSE_2(10,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(23,1)<4>      uwAVS_RESPONSE_2(10,12)<4;4,1>                                   
+//    mov (4) uwDEST_Y(26,1)<4>      uwAVS_RESPONSE_2(11,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(27,1)<4>      uwAVS_RESPONSE_2(11,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(30,1)<4>      uwAVS_RESPONSE_2(11,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(31,1)<4>      uwAVS_RESPONSE_2(11,12)<4;4,1>                                   
+//
+//// Move second 8x8 words of R to dest GRF
+//    mov (4) uwDEST_Y(2,0)<4>       uwAVS_RESPONSE_2(0,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(3,0)<4>       uwAVS_RESPONSE_2(0,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(6,0)<4>       uwAVS_RESPONSE_2(0,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(7,0)<4>       uwAVS_RESPONSE_2(0,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(10,0)<4>      uwAVS_RESPONSE_2(1,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(11,0)<4>      uwAVS_RESPONSE_2(1,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(14,0)<4>      uwAVS_RESPONSE_2(1,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(15,0)<4>      uwAVS_RESPONSE_2(1,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(18,0)<4>      uwAVS_RESPONSE_2(8,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(19,0)<4>      uwAVS_RESPONSE_2(8,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(22,0)<4>      uwAVS_RESPONSE_2(8,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(23,0)<4>      uwAVS_RESPONSE_2(8,12)<4;4,1>                                   
+//    mov (4) uwDEST_Y(26,0)<4>      uwAVS_RESPONSE_2(9,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(27,0)<4>      uwAVS_RESPONSE_2(9,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(30,0)<4>      uwAVS_RESPONSE_2(9,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(31,0)<4>      uwAVS_RESPONSE_2(9,12)<4;4,1>                                   
+//
+//// Move second 8x8 words of A to dest GRF
+//    mov (4) uwDEST_Y(2,3)<4>       uwAVS_RESPONSE_2(6,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(3,3)<4>       uwAVS_RESPONSE_2(6,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(6,3)<4>       uwAVS_RESPONSE_2(6,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(7,3)<4>       uwAVS_RESPONSE_2(6,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(10,3)<4>      uwAVS_RESPONSE_2(7,0)<4;4,1>                                      
+//    mov (4) uwDEST_Y(11,3)<4>      uwAVS_RESPONSE_2(7,8)<4;4,1>                                      
+//    mov (4) uwDEST_Y(14,3)<4>      uwAVS_RESPONSE_2(7,4)<4;4,1>                                    
+//    mov (4) uwDEST_Y(15,3)<4>      uwAVS_RESPONSE_2(7,12)<4;4,1>                                    
+//    mov (4) uwDEST_Y(18,3)<4>      uwAVS_RESPONSE_2(14,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(19,3)<4>      uwAVS_RESPONSE_2(14,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(22,3)<4>      uwAVS_RESPONSE_2(14,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(23,3)<4>      uwAVS_RESPONSE_2(14,12)<4;4,1>                                   
+//    mov (4) uwDEST_Y(26,3)<4>      uwAVS_RESPONSE_2(15,0)<4;4,1>                                     
+//    mov (4) uwDEST_Y(27,3)<4>      uwAVS_RESPONSE_2(15,8)<4;4,1>                                     
+//    mov (4) uwDEST_Y(30,3)<4>      uwAVS_RESPONSE_2(15,4)<4;4,1>                                   
+//    mov (4) uwDEST_Y(31,3)<4>      uwAVS_RESPONSE_2(15,12)<4;4,1>                                   
+
+#else   /* OUTPUT_8_BIT */
+
+// Move first 8x8 words of B to dest GRF
+    mov (8) DEST_B(0)<1>				ubAVS_RESPONSE(4,1)<16;4,2>                                      
+    mov (8) DEST_B(1)<1>				ubAVS_RESPONSE(4,8+1)<16;4,2>                                    
+    mov (8) DEST_B(2)<1>				ubAVS_RESPONSE(5,1)<16;4,2>                                      
+    mov (8) DEST_B(3)<1>				ubAVS_RESPONSE(5,8+1)<16;4,2>                                    
+    mov (8) DEST_B(4)<1>				ubAVS_RESPONSE(12,1)<16;4,2>                                     
+    mov (8) DEST_B(5)<1>				ubAVS_RESPONSE(12,8+1)<16;4,2>                                   
+    mov (8) DEST_B(6)<1>				ubAVS_RESPONSE(13,1)<16;4,2>                                     
+    mov (8) DEST_B(7)<1>				ubAVS_RESPONSE(13,8+1)<16;4,2>                                   
+
+// Move first 8x8 words of G to dest GRF
+    mov (8) DEST_G(0)<1>				ubAVS_RESPONSE(2,1)<16;4,2>              
+    mov (8) DEST_G(1)<1>				ubAVS_RESPONSE(2,8+1)<16;4,2>            
+    mov (8) DEST_G(2)<1>				ubAVS_RESPONSE(3,1)<16;4,2>              
+    mov (8) DEST_G(3)<1>				ubAVS_RESPONSE(3,8+1)<16;4,2>            
+    mov (8) DEST_G(4)<1>				ubAVS_RESPONSE(10,1)<16;4,2>             
+    mov (8) DEST_G(5)<1>				ubAVS_RESPONSE(10,8+1)<16;4,2>           
+    mov (8) DEST_G(6)<1>				ubAVS_RESPONSE(11,1)<16;4,2>             
+    mov (8) DEST_G(7)<1>				ubAVS_RESPONSE(11,8+1)<16;4,2>           
+
+// Move first 8x8 words of R to dest GRF
+    mov (8) DEST_R(0)<1>				ubAVS_RESPONSE(0,1)<16;4,2>                                      
+    mov (8) DEST_R(1)<1>				ubAVS_RESPONSE(0,8+1)<16;4,2>                                    
+    mov (8) DEST_R(2)<1>				ubAVS_RESPONSE(1,1)<16;4,2>                                      
+    mov (8) DEST_R(3)<1>				ubAVS_RESPONSE(1,8+1)<16;4,2>                                    
+    mov (8) DEST_R(4)<1>				ubAVS_RESPONSE(8,1)<16;4,2>                                      
+    mov (8) DEST_R(5)<1>				ubAVS_RESPONSE(8,8+1)<16;4,2>                                    
+    mov (8) DEST_R(6)<1>				ubAVS_RESPONSE(9,1)<16;4,2>                                      
+    mov (8) DEST_R(7)<1>				ubAVS_RESPONSE(9,8+1)<16;4,2>                                    
+
+// Move first 8x8 words of A to dest GRF
+    mov (8) DEST_A(0)<1>				ubAVS_RESPONSE(6,1)<16;4,2>                                      
+    mov (8) DEST_A(1)<1>				ubAVS_RESPONSE(6,8+1)<16;4,2>                                    
+    mov (8) DEST_A(2)<1>				ubAVS_RESPONSE(7,1)<16;4,2>                                      
+    mov (8) DEST_A(3)<1>				ubAVS_RESPONSE(7,8+1)<16;4,2>                                    
+    mov (8) DEST_A(4)<1>				ubAVS_RESPONSE(14,1)<16;4,2>                                     
+    mov (8) DEST_A(5)<1>				ubAVS_RESPONSE(14,8+1)<16;4,2>                                   
+    mov (8) DEST_A(6)<1>				ubAVS_RESPONSE(15,1)<16;4,2>                                     
+    mov (8) DEST_A(7)<1>				ubAVS_RESPONSE(15,8+1)<16;4,2>                                   
+
+// Move second 8x8 words of B to dest GRF
+    mov (8) DEST_B(0,8)<1>			ubAVS_RESPONSE_2(4,1)<16;4,2>                                      
+    mov (8) DEST_B(1,8)<1>			ubAVS_RESPONSE_2(4,8+1)<16;4,2>                                    
+    mov (8) DEST_B(2,8)<1>			ubAVS_RESPONSE_2(5,1)<16;4,2>                                      
+    mov (8) DEST_B(3,8)<1>			ubAVS_RESPONSE_2(5,8+1)<16;4,2>                                    
+    mov (8) DEST_B(4,8)<1>			ubAVS_RESPONSE_2(12,1)<16;4,2>                                     
+    mov (8) DEST_B(5,8)<1>			ubAVS_RESPONSE_2(12,8+1)<16;4,2>                                   
+    mov (8) DEST_B(6,8)<1>			ubAVS_RESPONSE_2(13,1)<16;4,2>                                     
+    mov (8) DEST_B(7,8)<1>			ubAVS_RESPONSE_2(13,8+1)<16;4,2>                                   
+
+// Move second 8x8 words of G to dest GRF
+    mov (8) DEST_G(0,8)<1>			ubAVS_RESPONSE_2(2,1)<16;4,2>              
+    mov (8) DEST_G(1,8)<1>			ubAVS_RESPONSE_2(2,8+1)<16;4,2>            
+    mov (8) DEST_G(2,8)<1>			ubAVS_RESPONSE_2(3,1)<16;4,2>              
+    mov (8) DEST_G(3,8)<1>			ubAVS_RESPONSE_2(3,8+1)<16;4,2>            
+    mov (8) DEST_G(4,8)<1>			ubAVS_RESPONSE_2(10,1)<16;4,2>             
+    mov (8) DEST_G(5,8)<1>			ubAVS_RESPONSE_2(10,8+1)<16;4,2>           
+    mov (8) DEST_G(6,8)<1>			ubAVS_RESPONSE_2(11,1)<16;4,2>             
+    mov (8) DEST_G(7,8)<1>			ubAVS_RESPONSE_2(11,8+1)<16;4,2>           
+
+// Move second 8x8 words of R to dest GRF
+    mov (8) DEST_R(0,8)<1>			ubAVS_RESPONSE_2(0,1)<16;4,2>                                      
+    mov (8) DEST_R(1,8)<1>			ubAVS_RESPONSE_2(0,8+1)<16;4,2>                                    
+    mov (8) DEST_R(2,8)<1>			ubAVS_RESPONSE_2(1,1)<16;4,2>                                      
+    mov (8) DEST_R(3,8)<1>			ubAVS_RESPONSE_2(1,8+1)<16;4,2>                                    
+    mov (8) DEST_R(4,8)<1>			ubAVS_RESPONSE_2(8,1)<16;4,2>                                      
+    mov (8) DEST_R(5,8)<1>			ubAVS_RESPONSE_2(8,8+1)<16;4,2>                                    
+    mov (8) DEST_R(6,8)<1>			ubAVS_RESPONSE_2(9,1)<16;4,2>                                      
+    mov (8) DEST_R(7,8)<1>			ubAVS_RESPONSE_2(9,8+1)<16;4,2>                                    
+
+// Move second 8x8 words of A to dest GRF
+    mov (8) DEST_A(0,8)<1>			ubAVS_RESPONSE_2(6,1)<16;4,2>                                      
+    mov (8) DEST_A(1,8)<1>			ubAVS_RESPONSE_2(6,8+1)<16;4,2>                                    
+    mov (8) DEST_A(2,8)<1>			ubAVS_RESPONSE_2(7,1)<16;4,2>                                      
+    mov (8) DEST_A(3,8)<1>			ubAVS_RESPONSE_2(7,8+1)<16;4,2>                                    
+    mov (8) DEST_A(4,8)<1>			ubAVS_RESPONSE_2(14,1)<16;4,2>                                     
+    mov (8) DEST_A(5,8)<1>			ubAVS_RESPONSE_2(14,8+1)<16;4,2>                                   
+    mov (8) DEST_A(6,8)<1>			ubAVS_RESPONSE_2(15,1)<16;4,2>                                     
+    mov (8) DEST_A(7,8)<1>			ubAVS_RESPONSE_2(15,8+1)<16;4,2>                                   
+#endif
+//------------------------------------------------------------------------------
+
+    // Set the region to write to memory (REGION_1; NOTE(review): original comment said "bottom region" - confirm)
+    #define SRC_REGION                              REGION_1
+ 
+    // Re-define new # of lines
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+       
+    #define nY_NUM_OF_ROWS      8
+    #define nUV_NUM_OF_ROWS     8
+        
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_Scaling.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_Scaling.asm
new file mode 100644
index 0000000..7429790
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_Scaling.asm
@@ -0,0 +1,72 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- RGB_Scaling.asm ----------
+#include "Scaling.inc"
+
+	// Build a 16-element ramp (0.0 .. 15.0) in float32 and normalize it
+//	mov (8)		SAMPLER_RAMP(0)<1>		0x76543210:v
+//	add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf		//3, 2, 1, 0 in float vector
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf	//7, 6, 5, 4 in float vector
+add	(8)		SAMPLER_RAMP(1)<1>		SAMPLER_RAMP(0)	8.0:f
+
+//Module: PrepareScaleCoord.asm
+
+	// Setup for sampler msg hdr
+    mov (2)		rMSGSRC.0<1>:ud			0:ud						{ NoDDClr }	// Unused fields
+    mov (1)		rMSGSRC.2<1>:ud			0:ud						{ NoDDChk }	// Write and offset
+
+	// Calculate 16 v based on the step Y and vertical origin
+	mov	(16)	mfMSGPAYLOAD(2)<1>		fSRC_VID_V_ORI<0;1,0>:f
+	mov	(16)	SCALE_COORD_Y<1>:f		fSRC_VID_V_ORI<0;1,0>:f
+
+	// Calculate 16 u based on the step X and hori origin
+//	line (16)	mfMSGPAYLOAD(0)<1>		SCALE_STEP_X<0;1,0>:f		SAMPLER_RAMP(0) 	// Assign to mrf directly
+	mov	(16)	acc0:f							fSRC_VID_H_ORI<0;1,0>:f											{ Compr }
+	mac	(16)	mfMSGPAYLOAD(0)<1>	fVIDEO_STEP_X<0;1,0>:f	SAMPLER_RAMP(0)			{ Compr }			
+
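+	// Set up the constants for the 'line' instruction. NOTE(review): r43.4/r43.7 appear to alias SAMPLER_RAMP(1), so this must stay after the mac above - confirm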
+	mov 	(1)		SCALE_LINE_P255<1>:f		255.0:f 			{ NoDDClr }	//{ NoDDClr, NoDDChk }
+	mov 	(1)		SCALE_LINE_P0_5<1>:f		0.5:f 				{ NoDDChk }
+
+	
+//------------------------------------------------------------------------------
+
+$for (0; <nY_NUM_OF_ROWS; 1) {
+
+	// Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA). NOTE(review): the stores at the end of this loop map GRF pairs 0/2/4/6 to DEST_R/G/B/A - confirm the channel order.
+  mov (8) 	MSGHDR_SCALE.0:ud      rMSGSRC.0<8;8,1>:ud    // Copy msg header and payload mirrors to MRFs
+	send (16)	SCALE_RESPONSE_YW(0)<1>		MSGHDR_SCALE	udDUMMY_NULL	nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_RGB+nBI_CURRENT_SRC_RGB
+
+	// Calculate 16 v for next line
+	add (16)	mfMSGPAYLOAD(2)<1>		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Assign to mrf directly
+	add (16)	SCALE_COORD_Y<1>:f		SCALE_COORD_Y<8;8,1>:f		fVIDEO_STEP_Y<0;1,0>:f	// Advance the GRF copy of v as well; it is the source for the next iteration
+
+	// Scale back to [0, 255], convert f to ud
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(0)	{ Compr }			// Process response GRFs 0-1 (stored to DEST_R below)
+	mov  (16) SCALE_RESPONSE_YD(0)<1>	acc0:f														{ Compr }
+
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(2)	{ Compr }			// Process response GRFs 2-3 (stored to DEST_G below)
+	mov  (16) SCALE_RESPONSE_YD(2)<1>	acc0:f														{ Compr }
+
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(4)	{ Compr }			// Process response GRFs 4-5 (stored to DEST_B below)
+	mov  (16) SCALE_RESPONSE_YD(4)<1>	acc0:f														{ Compr }
+
+//#if defined(SAVE_ARGB)	//Only needed if Alpha value is written to the destination
+	line (16)	acc0:f		SCALE_LINE_P255<0;1,0>:f	SCALE_RESPONSE_YF(6)	{ Compr }			// Process response GRFs 6-7 (stored to DEST_A below)
+	mov  (16) SCALE_RESPONSE_YD(6)<1>	acc0:f														{ Compr }
+//#endif
+
+	mov	 (16) 	DEST_R(%1)<1>				SCALE_RESPONSE_YB(0)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_G(%1)<1>				SCALE_RESPONSE_YB(2)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_B(%1)<1>				SCALE_RESPONSE_YB(4)											//possible error due to truncation - vK
+	mov	 (16) 	DEST_A(%1)<1>				SCALE_RESPONSE_YB(6)											//possible error due to truncation - vK
+}
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/Scaling.inc b/i965_drv_video/shaders/post_processing/Core_Kernels/Scaling.inc
new file mode 100644
index 0000000..bf66d4c
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/Scaling.inc
@@ -0,0 +1,75 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// File name: Scaling.inc
+
+#ifndef _SCALING_INC_
+#define _SCALING_INC_
+
+// Local variables----------------------------------------------------------------------------------
+#define MSGHDR_SCALE		m1		// Message Payload Header (Uses m2, m3, m4, m5 implicitly)
+
+//--------------------------------------------------------------------------------------------------
+//r10.0 thru r33.0; Primary surface read from sampler (16x8)
+#define DEST_Y			uwTOP_Y
+#define DEST_U			uwTOP_U
+#define DEST_V			uwTOP_V
+
+//r10.0 thru r41.0
+.declare DEST_B		Base=REG(r,10)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_G		Base=REG(r,18)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_R		Base=REG(r,26)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+.declare DEST_A		Base=REG(r,34)	ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
+
+//r56.0 thru r79.0
+.declare	SCALE_RESPONSE_YF  	Base=REG(r,nBOT_Y) 	ElementSize=4 SrcRegion=REGION(8,1) Type=f
+.declare	SCALE_RESPONSE_UF  	Base=REG(r,nBOT_U) 	ElementSize=4 SrcRegion=REGION(8,1) Type=f
+.declare	SCALE_RESPONSE_VF  	Base=REG(r,nBOT_V) 	ElementSize=4 SrcRegion=REGION(8,1) Type=f
+
+.declare	SCALE_RESPONSE_YW  	Base=REG(r,nBOT_Y) 	ElementSize=2 SrcRegion=REGION(16,1) Type=uw
+.declare	SCALE_RESPONSE_UW  	Base=REG(r,nBOT_U) 	ElementSize=2 SrcRegion=REGION(16,1) Type=uw
+.declare	SCALE_RESPONSE_VW  	Base=REG(r,nBOT_V) 	ElementSize=2 SrcRegion=REGION(16,1) Type=uw
+
+.declare	SCALE_RESPONSE_YD  	Base=REG(r,nBOT_Y) 	ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+.declare	SCALE_RESPONSE_UD  	Base=REG(r,nBOT_U) 	ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+.declare	SCALE_RESPONSE_VD  	Base=REG(r,nBOT_V) 	ElementSize=4 SrcRegion=REGION(8,1) Type=ud
+
+.declare	SCALE_RESPONSE_YB  	Base=REG(r,nBOT_Y) 	ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+.declare	SCALE_RESPONSE_UB  	Base=REG(r,nBOT_U) 	ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+.declare	SCALE_RESPONSE_VB  	Base=REG(r,nBOT_V) 	ElementSize=1 SrcRegion=REGION(8,4) Type=ub
+
+.declare	SAMPLER_RAMP   	Base=REG(r,42) ElementSize=4 SrcRegion=<8;8,1> Type=f	// 2 GRFs, 16 elements. NOTE(review): SCALE_LINE_P255/SCALE_LINE_P0_5 below use r43.4/r43.7, which appear to lie inside the second GRF of this ramp - confirm
+
+//#define	SCALE_STEP_X	REG2(r,43,0)
+//#define	SCALE_COORD_X	REG2(r,43,3)
+
+#define SCALE_LINE_P255			REG2(r,43,4)	// = 255.0	Used in 'line' inst to multiply 255, add 0.5, and round to int.
+#define SCALE_LINE_P0_5			REG2(r,43,7)	// = 0.5
+
+//r44.0 thru r45.0
+#define SCALE_COORD_Y		REG(r,44)	//2GRF
+
+
+// Send Message [DevILK]                                Message Descriptor
+//  MBZ MsgL=5 MsgR=8                            H MBZ   SIMD     MsgType   SmplrIndx BindTab
+//  000 0 101 0 1000                             1  0     10     0000         0000    00000000
+//    0     A    8                                     A             0             0     0     0
+//     MsgL=1+2*2(u,v)=5 MsgR=8
+#define SMPLR_MSG_DSC		0x0A8A0000	// ILK Sampler Message Descriptor
+
+// Re-define new number of lines
+#undef nY_NUM_OF_ROWS
+#undef nUV_NUM_OF_ROWS
+
+#define nY_NUM_OF_ROWS      8
+#define nUV_NUM_OF_ROWS     8
+
+
+#endif 	//_SCALING_INC_
diff --git a/i965_drv_video/shaders/post_processing/Makefile.am b/i965_drv_video/shaders/post_processing/Makefile.am
new file mode 100644
index 0000000..9f97eb0
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Makefile.am
@@ -0,0 +1,28 @@
+
+INTEL_G4I = 
+
+INTEL_G4A = null.g4a
+
+INTEL_G4B = null.g4b
+
+INTEL_G4B_GEN5 = null.g4b.gen5
+
+EXTRA_DIST = $(INTEL_G4I)	\
+	     $(INTEL_G4A)       \
+	     $(INTEL_G4B)    	\
+	     $(INTEL_G4B_GEN5)
+
+if HAVE_GEN4ASM
+
+SUFFIXES = .g4a .g4b
+.g4a.g4b:
+	m4 $*.g4a > $*.g4m && intel-gen4asm -o $@ $*.g4m && intel-gen4asm -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
+
+$(INTEL_G4B): $(INTEL_G4I)
+
+BUILT_SOURCES= $(INTEL_G4B)
+
+clean-local:
+	-rm -f $(INTEL_G4B)
+	-rm -f $(INTEL_G4B_GEN5)
+endif    
diff --git a/i965_drv_video/shaders/post_processing/null.g4a b/i965_drv_video/shaders/post_processing/null.g4a
new file mode 100644
index 0000000..cde124a
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/null.g4a
@@ -0,0 +1,3 @@
+/* Just for test */
+
+send(16) 0 acc0<1>UW g0<8,8,1>UW thread_spawner(0, 0, 0) mlen 1 rlen 0 {align1 EOT};
diff --git a/i965_drv_video/shaders/post_processing/null.g4b b/i965_drv_video/shaders/post_processing/null.g4b
new file mode 100644
index 0000000..d8f28e7
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/null.g4b
@@ -0,0 +1 @@
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x87100000 },
diff --git a/i965_drv_video/shaders/post_processing/null.g4b.gen5 b/i965_drv_video/shaders/post_processing/null.g4b.gen5
new file mode 100644
index 0000000..2bd0ba6
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/null.g4b.gen5
@@ -0,0 +1 @@
+   { 0x00800031, 0x24001d28, 0x748d0000, 0x82000000 },
diff --git a/i965_drv_video/shaders/post_processing/nv12_avs_nv12.asm b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.asm
new file mode 100644
index 0000000..80665e0
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.asm
@@ -0,0 +1,19 @@
+// Module name: NV12_AVS_NV12
+.kernel NV12_AVS_NV12
+.code
+
+#define INC_SCALING
+        
+#include "SetupVPKernel.asm"
+#include "Multiple_Loop_Head.asm"
+#include "PL2_AVS_IEF_16x8.asm"
+#include "PL8x4_Save_NV12.asm"
+#include "Multiple_Loop.asm"
+
+END_THREAD  // End of Thread
+
+.end_code  
+
+.end_kernel
+
+// end of nv12_avs_nv12.asm
diff --git a/i965_drv_video/shaders/post_processing/nv12_avs_nv12.g4b.gen5 b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.g4b.gen5
new file mode 100644
index 0000000..b2a9e85
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.g4b.gen5
@@ -0,0 +1,162 @@
+   { 0x00600001, 0x21000021, 0x008d0000, 0x00000000 },
+   { 0x00000441, 0x20842e2d, 0x000000b7, 0x00100010 },
+   { 0x00000c01, 0x2086022d, 0x000000bb, 0x00000000 },
+   { 0x00000801, 0x208a01ad, 0x000000a0, 0x00000000 },
+   { 0x00200001, 0x209403bd, 0x006600a4, 0x00000000 },
+   { 0x00000040, 0x208435ad, 0x00000084, 0x000000a0 },
+   { 0x01000005, 0x20002d2c, 0x00000088, 0x80008000 },
+   { 0x00010001, 0x20c003fd, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x212003bd, 0x000000c0, 0x00000000 },
+   { 0x00000001, 0x212403bd, 0x000000bc, 0x00000000 },
+   { 0x00000001, 0x213403bd, 0x00000038, 0x00000000 },
+   { 0x00200001, 0x612803bd, 0x004500a4, 0x00000000 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0000d000 },
+   { 0x00802001, 0x20000022, 0x008d0100, 0x00000000 },
+   { 0x00000031, 0x25401c09, 0x208d0000, 0x044bb401 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0000a000 },
+   { 0x00802001, 0x20400022, 0x008d0100, 0x00000000 },
+   { 0x02000031, 0x25c01c09, 0x208d0000, 0x048bb802 },
+   { 0x00000001, 0x240803bc, 0x000000a4, 0x00000000 },
+   { 0x00000048, 0x24087fbc, 0x000000bc, 0x41000000 },
+   { 0x00000048, 0x21287fbd, 0x000000c0, 0x41e00000 },
+   { 0x00000001, 0x240403bc, 0x000000bc, 0x00000000 },
+   { 0x00000048, 0x21247fbd, 0x000000c0, 0x41000000 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0000d000 },
+   { 0x00802001, 0x20000022, 0x008d0100, 0x00000000 },
+   { 0x00000031, 0x27401c09, 0x208d0000, 0x044bb401 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0000a000 },
+   { 0x00802001, 0x20400022, 0x008d0100, 0x00000000 },
+   { 0x02000031, 0x27c01c09, 0x208d0000, 0x048bb802 },
+   { 0x00600001, 0x21400229, 0x00aa0541, 0x00000000 },
+   { 0x00600001, 0x21600229, 0x00aa0549, 0x00000000 },
+   { 0x00600001, 0x21800229, 0x00aa0561, 0x00000000 },
+   { 0x00600001, 0x21a00229, 0x00aa0569, 0x00000000 },
+   { 0x00600001, 0x21c00229, 0x00aa0581, 0x00000000 },
+   { 0x00600001, 0x21e00229, 0x00aa0589, 0x00000000 },
+   { 0x00600001, 0x22000229, 0x00aa05a1, 0x00000000 },
+   { 0x00600001, 0x22200229, 0x00aa05a9, 0x00000000 },
+   { 0x00600001, 0x22400229, 0x00aa05c1, 0x00000000 },
+   { 0x00600001, 0x22600229, 0x00aa05c9, 0x00000000 },
+   { 0x00600001, 0x22800229, 0x00aa05e1, 0x00000000 },
+   { 0x00600001, 0x22a00229, 0x00aa05e9, 0x00000000 },
+   { 0x00600001, 0x22c00229, 0x00aa0641, 0x00000000 },
+   { 0x00600001, 0x22e00229, 0x00aa0649, 0x00000000 },
+   { 0x00600001, 0x23000229, 0x00aa0661, 0x00000000 },
+   { 0x00600001, 0x23200229, 0x00aa0669, 0x00000000 },
+   { 0x00600001, 0x23400229, 0x00aa0601, 0x00000000 },
+   { 0x00600001, 0x23600229, 0x00aa0609, 0x00000000 },
+   { 0x00600001, 0x23800229, 0x00aa0621, 0x00000000 },
+   { 0x00600001, 0x23a00229, 0x00aa0629, 0x00000000 },
+   { 0x00600001, 0x23c00229, 0x00aa0681, 0x00000000 },
+   { 0x00600001, 0x23e00229, 0x00aa0689, 0x00000000 },
+   { 0x00600001, 0x24000229, 0x00aa06a1, 0x00000000 },
+   { 0x00600001, 0x24200229, 0x00aa06a9, 0x00000000 },
+   { 0x00600001, 0x21500229, 0x00aa0741, 0x00000000 },
+   { 0x00600001, 0x21700229, 0x00aa0749, 0x00000000 },
+   { 0x00600001, 0x21900229, 0x00aa0761, 0x00000000 },
+   { 0x00600001, 0x21b00229, 0x00aa0769, 0x00000000 },
+   { 0x00600001, 0x21d00229, 0x00aa0781, 0x00000000 },
+   { 0x00600001, 0x21f00229, 0x00aa0789, 0x00000000 },
+   { 0x00600001, 0x22100229, 0x00aa07a1, 0x00000000 },
+   { 0x00600001, 0x22300229, 0x00aa07a9, 0x00000000 },
+   { 0x00600001, 0x22500229, 0x00aa07c1, 0x00000000 },
+   { 0x00600001, 0x22700229, 0x00aa07c9, 0x00000000 },
+   { 0x00600001, 0x22900229, 0x00aa07e1, 0x00000000 },
+   { 0x00600001, 0x22b00229, 0x00aa07e9, 0x00000000 },
+   { 0x00600001, 0x22d00229, 0x00aa0841, 0x00000000 },
+   { 0x00600001, 0x22f00229, 0x00aa0849, 0x00000000 },
+   { 0x00600001, 0x23100229, 0x00aa0861, 0x00000000 },
+   { 0x00600001, 0x23300229, 0x00aa0869, 0x00000000 },
+   { 0x00600001, 0x23500229, 0x00aa0801, 0x00000000 },
+   { 0x00600001, 0x23700229, 0x00aa0809, 0x00000000 },
+   { 0x00600001, 0x23900229, 0x00aa0821, 0x00000000 },
+   { 0x00600001, 0x23b00229, 0x00aa0829, 0x00000000 },
+   { 0x00600001, 0x23d00229, 0x00aa0881, 0x00000000 },
+   { 0x00600001, 0x23f00229, 0x00aa0889, 0x00000000 },
+   { 0x00600001, 0x24100229, 0x00aa08a1, 0x00000000 },
+   { 0x00600001, 0x24300229, 0x00aa08a9, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00200001, 0x202001a6, 0x004500a0, 0x00000000 },
+   { 0x00000001, 0x20280062, 0x00000000, 0x0007000f },
+   { 0x00000005, 0x24000c20, 0x000000b8, 0x00ffffff },
+   { 0x04000010, 0x20000c04, 0x00000400, 0x00ffffff },
+   { 0x00010220, 0x34001c00, 0x00001400, 0x00000056 },
+   { 0x01600031, 0x27000c01, 0x408d0000, 0x0248a007 },
+   { 0x0000040c, 0x21043da1, 0x000000a2, 0x00010001 },
+   { 0x00000801, 0x21080061, 0x00000000, 0x0003000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01600031, 0x28000c01, 0x408d0000, 0x0228a008 },
+   { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0007000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00000001, 0x26020228, 0x000000ba, 0x00000000 },
+   { 0x00610001, 0x24400129, 0x028d00b8, 0x00000000 },
+   { 0x00710001, 0x24400169, 0x02000000, 0x00000000 },
+   { 0x00000001, 0x24600061, 0x00000000, 0x00040001 },
+   { 0x00000001, 0x24640061, 0x00000000, 0x00400010 },
+   { 0x00000001, 0x24680061, 0x00000000, 0x04000100 },
+   { 0x00000001, 0x246c0061, 0x00000000, 0x40001000 },
+   { 0x00000001, 0x26020128, 0x00000440, 0x00000000 },
+   { 0x00910001, 0x41400231, 0x02b10700, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x02000440, 0x008d0460 },
+   { 0x00710001, 0x42400231, 0x02ae0800, 0x00000000 },
+   { 0x00710001, 0x43400231, 0x02ae0801, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000442, 0x00000000 },
+   { 0x00910001, 0x41600231, 0x02b10710, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000444, 0x00000000 },
+   { 0x00910001, 0x41800231, 0x02b10720, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x02000444, 0x008d0460 },
+   { 0x00710001, 0x42500231, 0x02ae0810, 0x00000000 },
+   { 0x00710001, 0x43500231, 0x02ae0811, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000446, 0x00000000 },
+   { 0x00910001, 0x41a00231, 0x02b10730, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000448, 0x00000000 },
+   { 0x00910001, 0x41c00231, 0x02b10740, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x02000448, 0x008d0460 },
+   { 0x00710001, 0x42600231, 0x02ae0820, 0x00000000 },
+   { 0x00710001, 0x43600231, 0x02ae0821, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x0000044a, 0x00000000 },
+   { 0x00910001, 0x41e00231, 0x02b10750, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x0000044c, 0x00000000 },
+   { 0x00910001, 0x42000231, 0x02b10760, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x0200044c, 0x008d0460 },
+   { 0x00710001, 0x42700231, 0x02ae0830, 0x00000000 },
+   { 0x00710001, 0x43700231, 0x02ae0831, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x0000044e, 0x00000000 },
+   { 0x00910001, 0x42200231, 0x02b10770, 0x00000000 },
+   { 0x00800001, 0x20400232, 0x00d20140, 0x00000000 },
+   { 0x00800001, 0x20500232, 0x00d20160, 0x00000000 },
+   { 0x00800001, 0x20600232, 0x00d20180, 0x00000000 },
+   { 0x00800001, 0x20700232, 0x00d201a0, 0x00000000 },
+   { 0x00800001, 0x20800232, 0x00d201c0, 0x00000000 },
+   { 0x00800001, 0x20900232, 0x00d201e0, 0x00000000 },
+   { 0x00800001, 0x20a00232, 0x00d20200, 0x00000000 },
+   { 0x00800001, 0x20b00232, 0x00d20220, 0x00000000 },
+   { 0x01600031, 0x20000c04, 0x508d0000, 0x0a082007 },
+   { 0x00200001, 0x210001a5, 0x004500a0, 0x00000000 },
+   { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0003000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00800001, 0x40400232, 0x00d20240, 0x00000000 },
+   { 0x00800001, 0x40410232, 0x00d20340, 0x00000000 },
+   { 0x00800001, 0x40600232, 0x00d20260, 0x00000000 },
+   { 0x00800001, 0x40610232, 0x00d20360, 0x00000000 },
+   { 0x01600031, 0x20000c04, 0x508d0000, 0x06082008 },
+   { 0x01000040, 0x20863dad, 0x00000086, 0xffffffff },
+   { 0x00000040, 0x20a03dad, 0x000000a0, 0x00100010 },
+   { 0x05000010, 0x2000358c, 0x02210400, 0x00000084 },
+   { 0x00000041, 0x24407fbd, 0x000000bc, 0x41800000 },
+   { 0x00000040, 0x20a477bd, 0x00000440, 0x000000a4 },
+   { 0x00010220, 0x34001c00, 0x00001400, 0x0000000e },
+   { 0x00010220, 0x34001c00, 0x02001400, 0xfffffede },
+   { 0x00000001, 0x20a001ad, 0x0000008a, 0x00000000 },
+   { 0x00000040, 0x20a23dad, 0x000000a2, 0x00080008 },
+   { 0x00000001, 0x20a403bd, 0x00000094, 0x00000000 },
+   { 0x00000041, 0x24407fbd, 0x00000038, 0x41000000 },
+   { 0x00000040, 0x20a877bd, 0x00000440, 0x000000a8 },
+   { 0x00000220, 0x34001c00, 0x00001400, 0xfffffed2 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 },
+   { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 },
+   { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 },
+   { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 },
diff --git a/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.asm b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.asm
new file mode 100644
index 0000000..3ea9cea
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.asm
@@ -0,0 +1,18 @@
+// Module name: NV12_DNDI_NV12
+.kernel NV12_DNDI_NV12
+.code
+
+#define INC_DNDI
+        
+#include "SetupVPKernel.asm"
+#include "Multiple_Loop_Head.asm"
+#include "PL_DNDI_ALG_UVCopy_NV12.asm"
+#include "Multiple_Loop.asm"
+
+END_THREAD  // End of Thread
+
+.end_code  
+
+.end_kernel
+
+// end of nv12_dndi_nv12.asm
diff --git a/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.g4b.gen5 b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.g4b.gen5
new file mode 100644
index 0000000..1f60f3f
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.g4b.gen5
@@ -0,0 +1,86 @@
+   { 0x00600001, 0x21000021, 0x008d0000, 0x00000000 },
+   { 0x00000441, 0x20842e2d, 0x000000b7, 0x00100010 },
+   { 0x00000c01, 0x2086022d, 0x000000bb, 0x00000000 },
+   { 0x00000801, 0x208a01ad, 0x000000a0, 0x00000000 },
+   { 0x00200001, 0x209403bd, 0x006600a4, 0x00000000 },
+   { 0x00000040, 0x208435ad, 0x00000084, 0x000000a0 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00000001, 0x204801aa, 0x000000a0, 0x00000000 },
+   { 0x00000001, 0x205801aa, 0x000000a2, 0x00000000 },
+   { 0x01600031, 0x24400c01, 0x208d0000, 0x04cb8004 },
+   { 0x00800001, 0x21400229, 0x00b10440, 0x00000000 },
+   { 0x00800001, 0x21600229, 0x00b10450, 0x00000000 },
+   { 0x00800001, 0x21800229, 0x00b10460, 0x00000000 },
+   { 0x00800001, 0x21a00229, 0x00b10470, 0x00000000 },
+   { 0x00600001, 0x22400229, 0x00ae0481, 0x00000000 },
+   { 0x00600001, 0x23400229, 0x00ae0480, 0x00000000 },
+   { 0x00600001, 0x22500229, 0x00ae0491, 0x00000000 },
+   { 0x00600001, 0x23500229, 0x00ae0490, 0x00000000 },
+   { 0x00600001, 0x22600229, 0x00ae04a1, 0x00000000 },
+   { 0x00600001, 0x23600229, 0x00ae04a0, 0x00000000 },
+   { 0x00600001, 0x22700229, 0x00ae04b1, 0x00000000 },
+   { 0x00600001, 0x23700229, 0x00ae04b0, 0x00000000 },
+   { 0x00800001, 0x21c00229, 0x00b104c0, 0x00000000 },
+   { 0x00800001, 0x21e00229, 0x00b104d0, 0x00000000 },
+   { 0x00800001, 0x22000229, 0x00b104e0, 0x00000000 },
+   { 0x00800001, 0x22200229, 0x00b104f0, 0x00000000 },
+   { 0x00600001, 0x22800229, 0x00ae0501, 0x00000000 },
+   { 0x00600001, 0x23800229, 0x00ae0500, 0x00000000 },
+   { 0x00600001, 0x22900229, 0x00ae0511, 0x00000000 },
+   { 0x00600001, 0x23900229, 0x00ae0510, 0x00000000 },
+   { 0x00600001, 0x22a00229, 0x00ae0521, 0x00000000 },
+   { 0x00600001, 0x23a00229, 0x00ae0520, 0x00000000 },
+   { 0x00600001, 0x22b00229, 0x00ae0531, 0x00000000 },
+   { 0x00600001, 0x23b00229, 0x00ae0530, 0x00000000 },
+   { 0x00000008, 0x21003da1, 0x000000a0, 0x00010001 },
+   { 0x00000001, 0x210401a1, 0x000000a2, 0x00000000 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x00030007 },
+   { 0x00600001, 0x21600022, 0x008d0100, 0x00000000 },
+   { 0x00600001, 0x21800022, 0x008d0540, 0x00000000 },
+   { 0x0b600031, 0x20000c04, 0x508d0000, 0x04082014 },
+   { 0x00200008, 0x21003da1, 0x004500a0, 0x00020002 },
+   { 0x00000040, 0x21002421, 0x00000100, 0x00000034 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x00000003 },
+   { 0x00600001, 0x21a00022, 0x008d0100, 0x00000000 },
+   { 0x00000001, 0x21c00022, 0x00000560, 0x00000000 },
+   { 0x0d600031, 0x20000c04, 0x508d0000, 0x04082014 },
+   { 0x00400040, 0x22083e28, 0x00690024, 0x07000700 },
+   { 0x01000010, 0x20003e2c, 0x0000003b, 0x00010001 },
+   { 0x00010220, 0x34001c00, 0x00001400, 0x0000000a },
+   { 0x00400001, 0x20400022, 0x00690580, 0x00000000 },
+   { 0x00400001, 0x20500022, 0x006904d0, 0x00000000 },
+   { 0x00400001, 0x20600022, 0x00690590, 0x00000000 },
+   { 0x00400001, 0x20700022, 0x006904f0, 0x00000000 },
+   { 0x00000220, 0x34001c00, 0x00001400, 0x00000008 },
+   { 0x00400001, 0x20400022, 0x006904c0, 0x00000000 },
+   { 0x00400001, 0x20500022, 0x00690580, 0x00000000 },
+   { 0x00400001, 0x20600022, 0x006904e0, 0x00000000 },
+   { 0x00400001, 0x20700022, 0x00690590, 0x00000000 },
+   { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0003000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01600031, 0x20000c04, 0x508d0000, 0x06082007 },
+   { 0x00200040, 0x210035a5, 0x004500a0, 0x00450074 },
+   { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0001000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01600031, 0x28000c01, 0x408d0000, 0x0218a002 },
+   { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 },
+   { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0001000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00600001, 0x20400022, 0x008d0800, 0x00000000 },
+   { 0x01600031, 0x20000c04, 0x508d0000, 0x04082008 },
+   { 0x01000040, 0x20863dad, 0x00000086, 0xffffffff },
+   { 0x00000040, 0x20a03dad, 0x000000a0, 0x00100010 },
+   { 0x05000010, 0x2000358c, 0x02210400, 0x00000084 },
+   { 0x00010220, 0x34001c00, 0x00001400, 0x00000008 },
+   { 0x00010220, 0x34001c00, 0x02001400, 0xffffff70 },
+   { 0x00000001, 0x20a001ad, 0x0000008a, 0x00000000 },
+   { 0x00000040, 0x20a23dad, 0x000000a2, 0x00080008 },
+   { 0x00000220, 0x34001c00, 0x00001400, 0xffffff6a },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 },
+   { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 },
+   { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 },
+   { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 },
diff --git a/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.asm b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.asm
new file mode 100644
index 0000000..f234f83
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.asm
@@ -0,0 +1,17 @@
+// Module name: NV12_LOAD_SAVE_NV12
+.kernel NV12_LOAD_SAVE_NV12
+.code
+
+#include "SetupVPKernel.asm"
+#include "Multiple_Loop_Head.asm"
+#include "NV12_Load_8x4.asm"        
+#include "PL8x4_Save_NV12.asm"
+#include "Multiple_Loop.asm"
+
+END_THREAD  // End of Thread
+
+.end_code  
+
+.end_kernel
+
+// end of nv12_load_save_nv12.asm
diff --git a/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.g4b.gen5 b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.g4b.gen5
new file mode 100644
index 0000000..9802ff2
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.g4b.gen5
@@ -0,0 +1,106 @@
+   { 0x00600001, 0x21000021, 0x008d0000, 0x00000000 },
+   { 0x00000441, 0x20842e2d, 0x000000b7, 0x00100010 },
+   { 0x00000c01, 0x2086022d, 0x000000bb, 0x00000000 },
+   { 0x00000801, 0x208a01ad, 0x000000a0, 0x00000000 },
+   { 0x00200001, 0x209403bd, 0x006600a4, 0x00000000 },
+   { 0x00000040, 0x208435ad, 0x00000084, 0x000000a0 },
+   { 0x00200040, 0x210035a5, 0x004500a0, 0x00450074 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0007000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01600031, 0x27000c01, 0x408d0000, 0x0248a001 },
+   { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0003000f },
+   { 0x00600001, 0x20400022, 0x008d0100, 0x00000000 },
+   { 0x02600031, 0x28000c01, 0x408d0000, 0x0228a002 },
+   { 0x00800001, 0x22200229, 0x00b10770, 0x00000000 },
+   { 0x00800001, 0x22000229, 0x00b10760, 0x00000000 },
+   { 0x00800001, 0x21e00229, 0x00b10750, 0x00000000 },
+   { 0x00800001, 0x21c00229, 0x00b10740, 0x00000000 },
+   { 0x00800001, 0x21a00229, 0x00b10730, 0x00000000 },
+   { 0x00800001, 0x21800229, 0x00b10720, 0x00000000 },
+   { 0x00800001, 0x21600229, 0x00b10710, 0x00000000 },
+   { 0x00800001, 0x21400229, 0x00b10700, 0x00000000 },
+   { 0x00800001, 0x22600229, 0x00d20820, 0x00000000 },
+   { 0x00800001, 0x23600229, 0x00d20821, 0x00000000 },
+   { 0x00800001, 0x22400229, 0x00d20800, 0x00000000 },
+   { 0x00800001, 0x23400229, 0x00d20801, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00200001, 0x202001a6, 0x004500a0, 0x00000000 },
+   { 0x00000001, 0x20280062, 0x00000000, 0x0007000f },
+   { 0x00000005, 0x24000c20, 0x000000b8, 0x00ffffff },
+   { 0x04000010, 0x20000c04, 0x00000400, 0x00ffffff },
+   { 0x00010220, 0x34001c00, 0x00001400, 0x00000056 },
+   { 0x01600031, 0x27000c01, 0x408d0000, 0x0248a007 },
+   { 0x0000040c, 0x21043da1, 0x000000a2, 0x00010001 },
+   { 0x00000801, 0x21080061, 0x00000000, 0x0003000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01600031, 0x28000c01, 0x408d0000, 0x0228a008 },
+   { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0007000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00000001, 0x26020228, 0x000000ba, 0x00000000 },
+   { 0x00610001, 0x24400129, 0x028d00b8, 0x00000000 },
+   { 0x00710001, 0x24400169, 0x02000000, 0x00000000 },
+   { 0x00000001, 0x24600061, 0x00000000, 0x00040001 },
+   { 0x00000001, 0x24640061, 0x00000000, 0x00400010 },
+   { 0x00000001, 0x24680061, 0x00000000, 0x04000100 },
+   { 0x00000001, 0x246c0061, 0x00000000, 0x40001000 },
+   { 0x00000001, 0x26020128, 0x00000440, 0x00000000 },
+   { 0x00910001, 0x41400231, 0x02b10700, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x02000440, 0x008d0460 },
+   { 0x00710001, 0x42400231, 0x02ae0800, 0x00000000 },
+   { 0x00710001, 0x43400231, 0x02ae0801, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000442, 0x00000000 },
+   { 0x00910001, 0x41600231, 0x02b10710, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000444, 0x00000000 },
+   { 0x00910001, 0x41800231, 0x02b10720, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x02000444, 0x008d0460 },
+   { 0x00710001, 0x42500231, 0x02ae0810, 0x00000000 },
+   { 0x00710001, 0x43500231, 0x02ae0811, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000446, 0x00000000 },
+   { 0x00910001, 0x41a00231, 0x02b10730, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000448, 0x00000000 },
+   { 0x00910001, 0x41c00231, 0x02b10740, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x02000448, 0x008d0460 },
+   { 0x00710001, 0x42600231, 0x02ae0820, 0x00000000 },
+   { 0x00710001, 0x43600231, 0x02ae0821, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x0000044a, 0x00000000 },
+   { 0x00910001, 0x41e00231, 0x02b10750, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x0000044c, 0x00000000 },
+   { 0x00910001, 0x42000231, 0x02b10760, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x0200044c, 0x008d0460 },
+   { 0x00710001, 0x42700231, 0x02ae0830, 0x00000000 },
+   { 0x00710001, 0x43700231, 0x02ae0831, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x0000044e, 0x00000000 },
+   { 0x00910001, 0x42200231, 0x02b10770, 0x00000000 },
+   { 0x00800001, 0x20400232, 0x00d20140, 0x00000000 },
+   { 0x00800001, 0x20500232, 0x00d20160, 0x00000000 },
+   { 0x00800001, 0x20600232, 0x00d20180, 0x00000000 },
+   { 0x00800001, 0x20700232, 0x00d201a0, 0x00000000 },
+   { 0x00800001, 0x20800232, 0x00d201c0, 0x00000000 },
+   { 0x00800001, 0x20900232, 0x00d201e0, 0x00000000 },
+   { 0x00800001, 0x20a00232, 0x00d20200, 0x00000000 },
+   { 0x00800001, 0x20b00232, 0x00d20220, 0x00000000 },
+   { 0x01600031, 0x20000c04, 0x508d0000, 0x0a082007 },
+   { 0x00200001, 0x210001a5, 0x004500a0, 0x00000000 },
+   { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0003000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00800001, 0x40400232, 0x00d20240, 0x00000000 },
+   { 0x00800001, 0x40410232, 0x00d20340, 0x00000000 },
+   { 0x00800001, 0x40600232, 0x00d20260, 0x00000000 },
+   { 0x00800001, 0x40610232, 0x00d20360, 0x00000000 },
+   { 0x01600031, 0x20000c04, 0x508d0000, 0x06082008 },
+   { 0x01000040, 0x20863dad, 0x00000086, 0xffffffff },
+   { 0x00000040, 0x20a03dad, 0x000000a0, 0x00100010 },
+   { 0x05000010, 0x2000358c, 0x02210400, 0x00000084 },
+   { 0x00010220, 0x34001c00, 0x00001400, 0x00000008 },
+   { 0x00010220, 0x34001c00, 0x02001400, 0xffffff48 },
+   { 0x00000001, 0x20a001ad, 0x0000008a, 0x00000000 },
+   { 0x00000040, 0x20a23dad, 0x000000a2, 0x00080008 },
+   { 0x00000220, 0x34001c00, 0x00001400, 0xffffff42 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 },
+   { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 },
+   { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 },
+   { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 },
diff --git a/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.asm b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.asm
new file mode 100644
index 0000000..d93d879
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.asm
@@ -0,0 +1,20 @@
+// Module name: NV12_SCALING_NV12
+.kernel NV12_SCALING_NV12
+.code
+
+#define INC_SCALING
+        
+#include "SetupVPKernel.asm"
+#include "Multiple_Loop_Head.asm"
+#include "PL2_Scaling.asm"
+#include "PL16x8_PL8x4.asm"        
+#include "PL8x4_Save_NV12.asm"
+#include "Multiple_Loop.asm"
+
+END_THREAD  // End of Thread
+
+.end_code  
+
+.end_kernel
+
+// end of nv12_scaling_nv12.asm
diff --git a/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.g4b.gen5 b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.g4b.gen5
new file mode 100644
index 0000000..6e99720
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.g4b.gen5
@@ -0,0 +1,222 @@
+   { 0x00600001, 0x21000021, 0x008d0000, 0x00000000 },
+   { 0x00000441, 0x20842e2d, 0x000000b7, 0x00100010 },
+   { 0x00000c01, 0x2086022d, 0x000000bb, 0x00000000 },
+   { 0x00000801, 0x208a01ad, 0x000000a0, 0x00000000 },
+   { 0x00200001, 0x209403bd, 0x006600a4, 0x00000000 },
+   { 0x00000040, 0x208435ad, 0x00000084, 0x000000a0 },
+   { 0x00400001, 0x254002fd, 0x00000000, 0x48403000 },
+   { 0x00400001, 0x255002fd, 0x00000000, 0x5c585450 },
+   { 0x00600040, 0x25607fbd, 0x008d0540, 0x41000000 },
+   { 0x00200401, 0x21000061, 0x00000000, 0x00000000 },
+   { 0x00000801, 0x21080061, 0x00000000, 0x00000000 },
+   { 0x00802001, 0x208003be, 0x000000a8, 0x00000000 },
+   { 0x00802001, 0x258003bd, 0x000000a8, 0x00000000 },
+   { 0x00802001, 0x240003bc, 0x000000a4, 0x00000000 },
+   { 0x00802048, 0x204077be, 0x000000bc, 0x008d0540 },
+   { 0x00000401, 0x257003fd, 0x00000000, 0x437f0000 },
+   { 0x00000801, 0x257c03fd, 0x00000000, 0x3f000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 },
+   { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 },
+   { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 },
+   { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 },
+   { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 },
+   { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 },
+   { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 },
+   { 0x00800001, 0x21400229, 0x00cf0700, 0x00000000 },
+   { 0x00800001, 0x22400229, 0x00cf0800, 0x00000000 },
+   { 0x00800001, 0x23400229, 0x00cf0840, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 },
+   { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 },
+   { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 },
+   { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 },
+   { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 },
+   { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 },
+   { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 },
+   { 0x00800001, 0x21600229, 0x00cf0700, 0x00000000 },
+   { 0x00800001, 0x22600229, 0x00cf0800, 0x00000000 },
+   { 0x00800001, 0x23600229, 0x00cf0840, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 },
+   { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 },
+   { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 },
+   { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 },
+   { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 },
+   { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 },
+   { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 },
+   { 0x00800001, 0x21800229, 0x00cf0700, 0x00000000 },
+   { 0x00800001, 0x22800229, 0x00cf0800, 0x00000000 },
+   { 0x00800001, 0x23800229, 0x00cf0840, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 },
+   { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 },
+   { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 },
+   { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 },
+   { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 },
+   { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 },
+   { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 },
+   { 0x00800001, 0x21a00229, 0x00cf0700, 0x00000000 },
+   { 0x00800001, 0x22a00229, 0x00cf0800, 0x00000000 },
+   { 0x00800001, 0x23a00229, 0x00cf0840, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 },
+   { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 },
+   { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 },
+   { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 },
+   { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 },
+   { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 },
+   { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 },
+   { 0x00800001, 0x21c00229, 0x00cf0700, 0x00000000 },
+   { 0x00800001, 0x22c00229, 0x00cf0800, 0x00000000 },
+   { 0x00800001, 0x23c00229, 0x00cf0840, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 },
+   { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 },
+   { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 },
+   { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 },
+   { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 },
+   { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 },
+   { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 },
+   { 0x00800001, 0x21e00229, 0x00cf0700, 0x00000000 },
+   { 0x00800001, 0x22e00229, 0x00cf0800, 0x00000000 },
+   { 0x00800001, 0x23e00229, 0x00cf0840, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 },
+   { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 },
+   { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 },
+   { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 },
+   { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 },
+   { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 },
+   { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 },
+   { 0x00800001, 0x22000229, 0x00cf0700, 0x00000000 },
+   { 0x00800001, 0x23000229, 0x00cf0800, 0x00000000 },
+   { 0x00800001, 0x24000229, 0x00cf0840, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 },
+   { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 },
+   { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 },
+   { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 },
+   { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 },
+   { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 },
+   { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 },
+   { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 },
+   { 0x00800001, 0x22200229, 0x00cf0700, 0x00000000 },
+   { 0x00800001, 0x23200229, 0x00cf0800, 0x00000000 },
+   { 0x00800001, 0x24200229, 0x00cf0840, 0x00000000 },
+   { 0x00600001, 0x22400129, 0x00ae0240, 0x00000000 },
+   { 0x00600001, 0x23400129, 0x00ae0340, 0x00000000 },
+   { 0x00600001, 0x22500129, 0x00ae0280, 0x00000000 },
+   { 0x00600001, 0x23500129, 0x00ae0380, 0x00000000 },
+   { 0x00600001, 0x22600129, 0x00ae02c0, 0x00000000 },
+   { 0x00600001, 0x23600129, 0x00ae03c0, 0x00000000 },
+   { 0x00600001, 0x22700129, 0x00ae0300, 0x00000000 },
+   { 0x00600001, 0x23700129, 0x00ae0400, 0x00000000 },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00200001, 0x202001a6, 0x004500a0, 0x00000000 },
+   { 0x00000001, 0x20280062, 0x00000000, 0x0007000f },
+   { 0x00000005, 0x24000c20, 0x000000b8, 0x00ffffff },
+   { 0x04000010, 0x20000c04, 0x00000400, 0x00ffffff },
+   { 0x00010220, 0x34001c00, 0x00001400, 0x00000056 },
+   { 0x01600031, 0x27000c01, 0x408d0000, 0x0248a007 },
+   { 0x0000040c, 0x21043da1, 0x000000a2, 0x00010001 },
+   { 0x00000801, 0x21080061, 0x00000000, 0x0003000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x01600031, 0x28000c01, 0x408d0000, 0x0228a008 },
+   { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0007000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00000001, 0x26020228, 0x000000ba, 0x00000000 },
+   { 0x00610001, 0x24400129, 0x028d00b8, 0x00000000 },
+   { 0x00710001, 0x24400169, 0x02000000, 0x00000000 },
+   { 0x00000001, 0x24600061, 0x00000000, 0x00040001 },
+   { 0x00000001, 0x24640061, 0x00000000, 0x00400010 },
+   { 0x00000001, 0x24680061, 0x00000000, 0x04000100 },
+   { 0x00000001, 0x246c0061, 0x00000000, 0x40001000 },
+   { 0x00000001, 0x26020128, 0x00000440, 0x00000000 },
+   { 0x00910001, 0x41400231, 0x02b10700, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x02000440, 0x008d0460 },
+   { 0x00710001, 0x42400231, 0x02ae0800, 0x00000000 },
+   { 0x00710001, 0x43400231, 0x02ae0801, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000442, 0x00000000 },
+   { 0x00910001, 0x41600231, 0x02b10710, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000444, 0x00000000 },
+   { 0x00910001, 0x41800231, 0x02b10720, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x02000444, 0x008d0460 },
+   { 0x00710001, 0x42500231, 0x02ae0810, 0x00000000 },
+   { 0x00710001, 0x43500231, 0x02ae0811, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000446, 0x00000000 },
+   { 0x00910001, 0x41a00231, 0x02b10730, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x00000448, 0x00000000 },
+   { 0x00910001, 0x41c00231, 0x02b10740, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x02000448, 0x008d0460 },
+   { 0x00710001, 0x42600231, 0x02ae0820, 0x00000000 },
+   { 0x00710001, 0x43600231, 0x02ae0821, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x0000044a, 0x00000000 },
+   { 0x00910001, 0x41e00231, 0x02b10750, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x0000044c, 0x00000000 },
+   { 0x00910001, 0x42000231, 0x02b10760, 0x00000000 },
+   { 0x02600005, 0x2000252c, 0x0200044c, 0x008d0460 },
+   { 0x00710001, 0x42700231, 0x02ae0830, 0x00000000 },
+   { 0x00710001, 0x43700231, 0x02ae0831, 0x00000000 },
+   { 0x00000001, 0x26020128, 0x0000044e, 0x00000000 },
+   { 0x00910001, 0x42200231, 0x02b10770, 0x00000000 },
+   { 0x00800001, 0x20400232, 0x00d20140, 0x00000000 },
+   { 0x00800001, 0x20500232, 0x00d20160, 0x00000000 },
+   { 0x00800001, 0x20600232, 0x00d20180, 0x00000000 },
+   { 0x00800001, 0x20700232, 0x00d201a0, 0x00000000 },
+   { 0x00800001, 0x20800232, 0x00d201c0, 0x00000000 },
+   { 0x00800001, 0x20900232, 0x00d201e0, 0x00000000 },
+   { 0x00800001, 0x20a00232, 0x00d20200, 0x00000000 },
+   { 0x00800001, 0x20b00232, 0x00d20220, 0x00000000 },
+   { 0x01600031, 0x20000c04, 0x508d0000, 0x0a082007 },
+   { 0x00200001, 0x210001a5, 0x004500a0, 0x00000000 },
+   { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 },
+   { 0x00000001, 0x21080061, 0x00000000, 0x0003000f },
+   { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 },
+   { 0x00800001, 0x40400232, 0x00d20240, 0x00000000 },
+   { 0x00800001, 0x40410232, 0x00d20340, 0x00000000 },
+   { 0x00800001, 0x40600232, 0x00d20260, 0x00000000 },
+   { 0x00800001, 0x40610232, 0x00d20360, 0x00000000 },
+   { 0x01600031, 0x20000c04, 0x508d0000, 0x06082008 },
+   { 0x01000040, 0x20863dad, 0x00000086, 0xffffffff },
+   { 0x00000040, 0x20a03dad, 0x000000a0, 0x00100010 },
+   { 0x05000010, 0x2000358c, 0x02210400, 0x00000084 },
+   { 0x00000041, 0x24407fbd, 0x000000bc, 0x41800000 },
+   { 0x00000040, 0x20a477bd, 0x00000440, 0x000000a4 },
+   { 0x00010220, 0x34001c00, 0x00001400, 0x0000000e },
+   { 0x00010220, 0x34001c00, 0x02001400, 0xfffffe66 },
+   { 0x00000001, 0x20a001ad, 0x0000008a, 0x00000000 },
+   { 0x00000040, 0x20a23dad, 0x000000a2, 0x00080008 },
+   { 0x00000001, 0x20a403bd, 0x00000094, 0x00000000 },
+   { 0x00000041, 0x24407fbd, 0x00000038, 0x41000000 },
+   { 0x00000040, 0x20a877bd, 0x00000440, 0x000000a8 },
+   { 0x00000220, 0x34001c00, 0x00001400, 0xfffffe5a },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 },
+   { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 },
+   { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 },
+   { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 },
diff --git a/libva-tpi.pc.in b/libva-tpi.pc.in
new file mode 100644
index 0000000..43616c0
--- /dev/null
+++ b/libva-tpi.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libva-tpi
+Description: Userspace Video Acceleration (VA) 3rd party interface
+Requires: libva
+Version: @PACKAGE_VERSION@
+Libs: -L${libdir} -lva-tpi
+Cflags: -I${includedir}
diff --git a/va/va.h b/va/va.h
index 76c5708..0f12dfb 100644
--- a/va/va.h
+++ b/va/va.h
@@ -145,6 +145,14 @@ typedef int VAStatus;	/* Return status type from functions */
 /* Color space conversion flags for vaPutSurface() */
 #define VA_SRC_BT601            0x00000010
 #define VA_SRC_BT709            0x00000020
+#define VA_SRC_SMPTE_240        0x00000040
+
+/* Scaling flags for vaPutSurface(); the mode values lie within VA_FILTER_SCALING_MASK */
+#define VA_FILTER_SCALING_DEFAULT       0x00000000
+#define VA_FILTER_SCALING_FAST          0x00000100
+#define VA_FILTER_SCALING_HQ            0x00000200
+#define VA_FILTER_SCALING_NL_ANAMORPHIC 0x00000300
+#define VA_FILTER_SCALING_MASK          0x00000f00
 
 /*
  * Returns a short english description of error_status
@@ -1114,7 +1122,9 @@ typedef struct _VAEncPictureParameterBufferH264
     VABufferID coded_buf;
     unsigned short picture_width;
     unsigned short picture_height;
-    unsigned char last_picture; /* if set to 1 it indicates the last picture in the sequence */
+    unsigned char last_picture; /* if set to 1 it indicates the last picture in the sequence
+                                 * if set to 2 it indicates the last picture of the stream
+                                 */
 } VAEncPictureParameterBufferH264;
 
 /****************************
@@ -1685,15 +1695,6 @@ typedef enum
     VADISPLAYATTRIB_BLE_NONE,
 } VADisplayAttribBLEMode;
 
-typedef enum
-{ 
-    VADISPLAYATTRIB_CSC_FORMAT_YCC_BT601 = 0x00,
-    VADISPLAYATTRIB_CSC_FORMAT_YCC_BT709,
-    VADISPLAYATTRIB_CSC_FORMAT_YCC_SMPTE_240,
-    VADISPLAYATTRIB_CSC_FORMAT_RGB,
-    VADISPLAYATTRIB_CSC_FORMAT_NONE,
-} VADisplayAttribCSCFormat;
-
 /* attribute value for VADisplayAttribRotation   */
 #define VA_ROTATION_NONE        0x00000000
 #define VA_ROTATION_90          0x00000001
diff --git a/va/x11/Makefile.am b/va/x11/Makefile.am
index 2e3619c..31e381e 100644
--- a/va/x11/Makefile.am
+++ b/va/x11/Makefile.am
@@ -1,22 +1,24 @@
-# INTEL CONFIDENTIAL
-# Copyright 2007 Intel Corporation. All Rights Reserved.
+# Copyright (c) 2007 Intel Corporation. All Rights Reserved.
 #
-# The source code contained or described herein and all documents related to
-# the source code ("Material") are owned by Intel Corporation or its suppliers
-# or licensors. Title to the Material remains with Intel Corporation or its
-# suppliers and licensors. The Material may contain trade secrets and
-# proprietary and confidential information of Intel Corporation and its
-# suppliers and licensors, and is protected by worldwide copyright and trade
-# secret laws and treaty provisions. No part of the Material may be used,
-# copied, reproduced, modified, published, uploaded, posted, transmitted,
-# distributed, or disclosed in any way without Intel's prior express written
-# permission. 
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sub license, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
 # 
-# No license under any patent, copyright, trade secret or other intellectual
-# property right is granted to or conferred upon you by disclosure or delivery
-# of the Materials, either expressly, by implication, inducement, estoppel or
-# otherwise. Any license under such intellectual property rights must be
-# express and approved by Intel in writing.
+# The above copyright notice and this permission notice (including the
+# next paragraph) shall be included in all copies or substantial portions
+# of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+# IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 AM_CFLAGS = -DLINUX -I$(top_srcdir)/va $(DRM_CFLAGS)