diff options
124 files changed, 10359 insertions, 46 deletions
diff --git a/Makefile.am b/Makefile.am index e75a2db..cf57b8f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -30,7 +30,7 @@ if BUILD_I965_DRIVER SUBDIRS += i965_drv_video endif -pcfiles = libva.pc +pcfiles = libva.pc libva-tpi.pc pcfiles += libva-x11.pc if USE_GLX pcfiles += libva-glx.pc @@ -39,6 +39,6 @@ endif pkgconfigdir = @pkgconfigdir@ pkgconfig_DATA = $(pcfiles) -EXTRA_DIST = libva.pc.in libva-x11.pc.in libva-glx.pc.in +EXTRA_DIST = libva.pc.in libva-tpi.pc.in libva-x11.pc.in libva-glx.pc.in CLEANFILES = $(pcfiles) diff --git a/configure.ac b/configure.ac index 8792dcc..ac54963 100644 --- a/configure.ac +++ b/configure.ac @@ -29,7 +29,7 @@ m4_define([libva_version], [libva_major_version.libva_minor_version.libva_micro_version]) # if the library source code has changed, increment revision -m4_define([libva_lt_revision], [4]) +m4_define([libva_lt_revision], [5]) # if any interface was added/removed/changed, then inc current, reset revision m4_define([libva_lt_current], [1]) # if any interface was added since last public release, then increment age @@ -163,6 +163,7 @@ AC_OUTPUT([ i965_drv_video/shaders/mpeg2/Makefile i965_drv_video/shaders/mpeg2/vld/Makefile i965_drv_video/shaders/render/Makefile + i965_drv_video/shaders/post_processing/Makefile test/Makefile test/basic/Makefile test/decode/Makefile @@ -171,5 +172,6 @@ AC_OUTPUT([ libva.pc libva-x11.pc libva-glx.pc + libva-tpi.pc ]) diff --git a/i965_drv_video/Makefile.am b/i965_drv_video/Makefile.am index 058b525..f32d579 100644 --- a/i965_drv_video/Makefile.am +++ b/i965_drv_video/Makefile.am @@ -42,7 +42,8 @@ i965_drv_video_la_SOURCES = \ i965_drv_video.c \ i965_avc_bsd.c \ i965_avc_hw_scoreboard.c\ - i965_avc_ildb.c + i965_avc_ildb.c \ + i965_post_processing.c noinst_HEADERS = \ object_heap.h \ @@ -59,4 +60,5 @@ noinst_HEADERS = \ i965_structs.h \ i965_avc_bsd.h \ i965_avc_hw_scoreboard.h\ - i965_avc_ildb.h + i965_avc_ildb.h \ + i965_post_processing.h diff --git a/i965_drv_video/i965_defines.h 
b/i965_drv_video/i965_defines.h index aa2baa3..839712e 100644 --- a/i965_drv_video/i965_defines.h +++ b/i965_drv_video/i965_defines.h @@ -357,6 +357,29 @@ #define SCOREBOARD_STALLING 0 #define SCOREBOARD_NON_STALLING 1 +#define SURFACE_FORMAT_YCRCB_NORMAL 0 +#define SURFACE_FORMAT_YCRCB_SWAPUVY 1 +#define SURFACE_FORMAT_YCRCB_SWAPUV 2 +#define SURFACE_FORMAT_YCRCB_SWAPY 3 +#define SURFACE_FORMAT_PLANAR_420_8 4 +#define SURFACE_FORMAT_PLANAR_411_8 5 +#define SURFACE_FORMAT_PLANAR_422_8 6 +#define SURFACE_FORMAT_STMM_DN_STATISTICS 7 +#define SURFACE_FORMAT_R10G10B10A2_UNORM 8 +#define SURFACE_FORMAT_R8G8B8A8_UNORM 9 +#define SURFACE_FORMAT_R8B8_UNORM 10 +#define SURFACE_FORMAT_R8_UNORM 11 +#define SURFACE_FORMAT_Y8_UNORM 12 + +#define AVS_FILTER_ADAPTIVE_8_TAP 0 +#define AVS_FILTER_NEAREST 1 + +#define IEF_FILTER_COMBO 0 +#define IEF_FILTER_DETAIL 1 + +#define IEF_FILTER_SIZE_3X3 0 +#define IEF_FILTER_SIZE_5X5 1 + #define URB_SIZE(intel) (IS_IRONLAKE(intel->device_id) ? 1024 : \ IS_G4X(intel->device_id) ? 
384 : 256) #endif /* _I965_DEFINES_H_ */ diff --git a/i965_drv_video/i965_drv_video.c b/i965_drv_video/i965_drv_video.c index 104c105..ec5412d 100644 --- a/i965_drv_video/i965_drv_video.c +++ b/i965_drv_video/i965_drv_video.c @@ -350,6 +350,8 @@ i965_destroy_surface(struct object_heap *heap, struct object_base *obj) dri_bo_unreference(obj_surface->bo); obj_surface->bo = NULL; + dri_bo_unreference(obj_surface->pp_out_bo); + obj_surface->pp_out_bo = NULL; if (obj_surface->free_private_data != NULL) { obj_surface->free_private_data(&obj_surface->private_data); @@ -395,6 +397,7 @@ i965_CreateSurfaces(VADriverContextP ctx, obj_surface->size = SIZE_YUV420(obj_surface->width, obj_surface->height); obj_surface->flags = SURFACE_REFERENCED; obj_surface->bo = NULL; + obj_surface->pp_out_bo = NULL; obj_surface->private_data = NULL; obj_surface->free_private_data = NULL; } @@ -1644,7 +1647,7 @@ i965_GetImage(VADriverContextP ctx, VAStatus i965_PutSurface(VADriverContextP ctx, VASurfaceID surface, - Drawable draw, /* X Drawable */ + void *draw, /* X Drawable */ short srcx, short srcy, unsigned short srcw, @@ -1667,6 +1670,7 @@ i965_PutSurface(VADriverContextP ctx, int ret; uint32_t name; Bool new_region = False; + int pp_flag = 0; /* Currently don't support DRI1 */ if (dri_state->driConnectedFlag != VA_DRI2) return VA_STATUS_ERROR_UNKNOWN; @@ -1678,7 +1682,7 @@ i965_PutSurface(VADriverContextP ctx, if (obj_surface->bo == NULL) return VA_STATUS_SUCCESS; - dri_drawable = dri_get_drawable(ctx, draw); + dri_drawable = dri_get_drawable(ctx, (Drawable)draw); assert(dri_drawable); buffer = dri_get_rendering_buffer(ctx, dri_drawable); @@ -1716,9 +1720,16 @@ i965_PutSurface(VADriverContextP ctx, assert(ret == 0); } + if ((flags & VA_FILTER_SCALING_MASK) == VA_FILTER_SCALING_NL_ANAMORPHIC) + pp_flag |= I965_PP_FLAG_AVS; + + if (flags & (VA_BOTTOM_FIELD | VA_TOP_FIELD)) + pp_flag |= I965_PP_FLAG_DEINTERLACING; + i965_render_put_surface(ctx, surface, srcx, srcy, srcw, srch, - destx, desty, 
destw, desth); + destx, desty, destw, desth, + pp_flag); if(obj_surface->subpic != VA_INVALID_ID) { i965_render_put_subpic(ctx, surface, diff --git a/i965_drv_video/i965_drv_video.h b/i965_drv_video/i965_drv_video.h index 8643bd6..7fc9cdb 100644 --- a/i965_drv_video/i965_drv_video.h +++ b/i965_drv_video/i965_drv_video.h @@ -109,6 +109,11 @@ struct object_surface int orig_height; int flags; dri_bo *bo; + int pp_out_width; + int pp_out_height; + int orig_pp_out_width; + int orig_pp_out_height; + dri_bo *pp_out_bo; void (*free_private_data)(void **data); void *private_data; }; diff --git a/i965_drv_video/i965_post_processing.c b/i965_drv_video/i965_post_processing.c new file mode 100644 index 0000000..633100c --- /dev/null +++ b/i965_drv_video/i965_post_processing.c @@ -0,0 +1,2029 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Xiang Haihao <haihao.xiang@intel.com> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include <va/va_backend.h> + +#include "intel_batchbuffer.h" +#include "intel_driver.h" + +#include "i965_defines.h" +#include "i965_post_processing.h" +#include "i965_render.h" +#include "i965_drv_video.h" + +struct pp_module +{ + /* kernel */ + char *name; + int interface; + unsigned int (*bin)[4]; + int size; + dri_bo *bo; + + /* others */ + void (*initialize)(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth); +}; + +static uint32_t pp_null_gen5[][4] = { +#include "shaders/post_processing/null.g4b.gen5" +}; + +static uint32_t pp_nv12_load_save_gen5[][4] = { +#include "shaders/post_processing/nv12_load_save_nv12.g4b.gen5" +}; + +static uint32_t pp_nv12_scaling_gen5[][4] = { +#include "shaders/post_processing/nv12_scaling_nv12.g4b.gen5" +}; + +static uint32_t pp_nv12_avs_gen5[][4] = { +#include "shaders/post_processing/nv12_avs_nv12.g4b.gen5" +}; + +static uint32_t pp_nv12_dndi_gen5[][4] = { +#include "shaders/post_processing/nv12_dndi_nv12.g4b.gen5" +}; + +static void ironlake_pp_null_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth); +static void ironlake_pp_nv12_avs_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth); +static void ironlake_pp_nv12_scaling_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth); +static void ironlake_pp_nv12_load_save_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth); +static 
void ironlake_pp_nv12_dndi_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth); + +static struct pp_module pp_modules_gen5[] = { + { + "NULL module (for testing)", + PP_NULL, + pp_null_gen5, + sizeof(pp_null_gen5), + NULL, + ironlake_pp_null_initialize, + }, + + { + "NV12 Load & Save module", + PP_NV12_LOAD_SAVE, + pp_nv12_load_save_gen5, + sizeof(pp_nv12_load_save_gen5), + NULL, + ironlake_pp_nv12_load_save_initialize, + }, + + { + "NV12 Scaling module", + PP_NV12_SCALING, + pp_nv12_scaling_gen5, + sizeof(pp_nv12_scaling_gen5), + NULL, + ironlake_pp_nv12_scaling_initialize, + }, + + { + "NV12 AVS module", + PP_NV12_AVS, + pp_nv12_avs_gen5, + sizeof(pp_nv12_avs_gen5), + NULL, + ironlake_pp_nv12_avs_initialize, + }, + + { + "NV12 DNDI module", + PP_NV12_DNDI, + pp_nv12_dndi_gen5, + sizeof(pp_nv12_dndi_gen5), + NULL, + ironlake_pp_nv12_dndi_initialize, + }, +}; + +#define NUM_PP_MODULES ARRAY_ELEMS(pp_modules_gen5) + +static struct pp_module *pp_modules = NULL; + +struct ironlake_pp_static_parameter +{ + struct { + /* Procamp r1.0 */ + float procamp_constant_c0; + + /* Load and Save r1.1 */ + unsigned int source_packed_y_offset:8; + unsigned int source_packed_u_offset:8; + unsigned int source_packed_v_offset:8; + unsigned int pad0:8; + + union { + /* Load and Save r1.2 */ + struct { + unsigned int destination_packed_y_offset:8; + unsigned int destination_packed_u_offset:8; + unsigned int destination_packed_v_offset:8; + unsigned int pad0:8; + } load_and_save; + + /* CSC r1.2 */ + struct { + unsigned int destination_rgb_format:8; + unsigned int pad0:24; + } csc; + } r1_2; + + /* Procamp r1.3 */ + float procamp_constant_c1; + + /* Procamp r1.4 */ + float procamp_constant_c2; + + /* DI r1.5 */ + unsigned int statistics_surface_picth:16; /* Divided by 2 */ + unsigned int pad1:16; + + union { + /* DI r1.6 */ + struct { + unsigned int pad0:24; + unsigned int 
top_field_first:8; + } di; + + /* AVS/Scaling r1.6 */ + float normalized_video_y_scaling_step; + } r1_6; + + /* Procamp r1.7 */ + float procamp_constant_c5; + } grf1; + + struct { + /* Procamp r2.0 */ + float procamp_constant_c3; + + /* MBZ r2.1*/ + unsigned int pad0; + + /* WG+CSC r2.2 */ + float wg_csc_constant_c4; + + /* WG+CSC r2.3 */ + float wg_csc_constant_c8; + + /* Procamp r2.4 */ + float procamp_constant_c4; + + /* MBZ r2.5 */ + unsigned int pad1; + + /* MBZ r2.6 */ + unsigned int pad2; + + /* WG+CSC r2.7 */ + float wg_csc_constant_c9; + } grf2; + + struct { + /* WG+CSC r3.0 */ + float wg_csc_constant_c0; + + /* Blending r3.1 */ + float scaling_step_ratio; + + /* Blending r3.2 */ + float normalized_alpha_y_scaling; + + /* WG+CSC r3.3 */ + float wg_csc_constant_c4; + + /* WG+CSC r3.4 */ + float wg_csc_constant_c1; + + /* ALL r3.5 */ + int horizontal_origin_offset:16; + int vertical_origin_offset:16; + + /* Shared r3.6*/ + union { + /* Color fill */ + unsigned int color_pixel; + + /* WG+CSC */ + float wg_csc_constant_c2; + } r3_6; + + /* WG+CSC r3.7 */ + float wg_csc_constant_c3; + } grf3; + + struct { + /* WG+CSC r4.0 */ + float wg_csc_constant_c6; + + /* ALL r4.1 MBZ ???*/ + unsigned int pad0; + + /* Shared r4.2 */ + union { + /* AVS */ + struct { + unsigned int pad1:15; + unsigned int nlas:1; + unsigned int pad2:16; + } avs; + + /* DI */ + struct { + unsigned int motion_history_coefficient_m2:8; + unsigned int motion_history_coefficient_m1:8; + unsigned int pad0:16; + } di; + } r4_2; + + /* WG+CSC r4.3 */ + float wg_csc_constant_c7; + + /* WG+CSC r4.4 */ + float wg_csc_constant_c10; + + /* AVS r4.5 */ + float source_video_frame_normalized_horizontal_origin; + + /* MBZ r4.6 */ + unsigned int pad1; + + /* WG+CSC r4.7 */ + float wg_csc_constant_c11; + } grf4; +}; + +struct ironlake_pp_inline_parameter +{ + struct { + /* ALL r5.0 */ + int destination_block_horizontal_origin:16; + int destination_block_vertical_origin:16; + + /* Shared r5.1 */ + union { + /* 
AVS/Scaling */ + float source_surface_block_normalized_horizontal_origin; + + /* FMD */ + struct { + unsigned int variance_surface_vertical_origin:16; + unsigned int pad0:16; + } fmd; + } r5_1; + + /* AVS/Scaling r5.2 */ + float source_surface_block_normalized_vertical_origin; + + /* Alpha r5.3 */ + float alpha_surface_block_normalized_horizontal_origin; + + /* Alpha r5.4 */ + float alpha_surface_block_normalized_vertical_origin; + + /* Alpha r5.5 */ + unsigned int alpha_mask_x:16; + unsigned int alpha_mask_y:8; + unsigned int block_count_x:8; + + /* r5.6 */ + unsigned int block_horizontal_mask:16; + unsigned int block_vertical_mask:8; + unsigned int number_blocks:8; + + /* AVS/Scaling r5.7 */ + float normalized_video_x_scaling_step; + } grf5; + + struct { + /* AVS r6.0 */ + float video_step_delta; + + /* r6.1-r6.7 */ + unsigned int padx[7]; + } grf6; +}; + +static struct ironlake_pp_static_parameter ironlake_pp_static_parameter; +static struct ironlake_pp_inline_parameter ironlake_pp_inline_parameter; + +static void +ironlake_pp_surface_state(struct i965_post_processing_context *pp_context) +{ + +} + +static void +ironlake_pp_interface_descriptor_table(struct i965_post_processing_context *pp_context) +{ + struct i965_interface_descriptor *desc; + dri_bo *bo; + int pp_index = pp_context->current_pp; + + bo = pp_context->idrt.bo; + dri_bo_map(bo, 1); + assert(bo->virtual); + desc = bo->virtual; + memset(desc, 0, sizeof(*desc)); + desc->desc0.grf_reg_blocks = 10; + desc->desc0.kernel_start_pointer = pp_modules[pp_index].bo->offset >> 6; /* reloc */ + desc->desc1.const_urb_entry_read_offset = 0; + desc->desc1.const_urb_entry_read_len = 4; /* grf 1-4 */ + desc->desc2.sampler_state_pointer = pp_context->sampler_state_table.bo->offset >> 5; + desc->desc2.sampler_count = 0; + desc->desc3.binding_table_entry_count = 0; + desc->desc3.binding_table_pointer = + pp_context->binding_table.bo->offset >> 5; /*reloc */ + + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + 
desc->desc0.grf_reg_blocks, + offsetof(struct i965_interface_descriptor, desc0), + pp_modules[pp_index].bo); + + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + desc->desc2.sampler_count << 2, + offsetof(struct i965_interface_descriptor, desc2), + pp_context->sampler_state_table.bo); + + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + desc->desc3.binding_table_entry_count, + offsetof(struct i965_interface_descriptor, desc3), + pp_context->binding_table.bo); + + dri_bo_unmap(bo); +} + +static void +ironlake_pp_binding_table(struct i965_post_processing_context *pp_context) +{ + unsigned int *binding_table; + dri_bo *bo = pp_context->binding_table.bo; + int i; + + dri_bo_map(bo, 1); + assert(bo->virtual); + binding_table = bo->virtual; + memset(binding_table, 0, bo->size); + + for (i = 0; i < MAX_PP_SURFACES; i++) { + if (pp_context->surfaces[i].ss_bo) { + assert(pp_context->surfaces[i].s_bo); + + binding_table[i] = pp_context->surfaces[i].ss_bo->offset; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + 0, + i * sizeof(*binding_table), + pp_context->surfaces[i].ss_bo); + } + + } + + dri_bo_unmap(bo); +} + +static void +ironlake_pp_vfe_state(struct i965_post_processing_context *pp_context) +{ + struct i965_vfe_state *vfe_state; + dri_bo *bo; + + bo = pp_context->vfe_state.bo; + dri_bo_map(bo, 1); + assert(bo->virtual); + vfe_state = bo->virtual; + memset(vfe_state, 0, sizeof(*vfe_state)); + vfe_state->vfe1.max_threads = pp_context->urb.num_vfe_entries - 1; + vfe_state->vfe1.urb_entry_alloc_size = pp_context->urb.size_vfe_entry - 1; + vfe_state->vfe1.num_urb_entries = pp_context->urb.num_vfe_entries; + vfe_state->vfe1.vfe_mode = VFE_GENERIC_MODE; + vfe_state->vfe1.children_present = 0; + vfe_state->vfe2.interface_descriptor_base = + pp_context->idrt.bo->offset >> 4; /* reloc */ + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + 0, + offsetof(struct i965_vfe_state, vfe2), + pp_context->idrt.bo); + dri_bo_unmap(bo); +} + +static void 
+ironlake_pp_upload_constants(struct i965_post_processing_context *pp_context) +{ + unsigned char *constant_buffer; + + assert(sizeof(ironlake_pp_static_parameter) == 128); + dri_bo_map(pp_context->curbe.bo, 1); + assert(pp_context->curbe.bo->virtual); + constant_buffer = pp_context->curbe.bo->virtual; + memcpy(constant_buffer, &ironlake_pp_static_parameter, sizeof(ironlake_pp_static_parameter)); + dri_bo_unmap(pp_context->curbe.bo); +} + +static void +ironlake_pp_states_setup(VADriverContextP ctx) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct i965_post_processing_context *pp_context = &i965->render_state.pp_context; + + ironlake_pp_surface_state(pp_context); + ironlake_pp_binding_table(pp_context); + ironlake_pp_interface_descriptor_table(pp_context); + ironlake_pp_vfe_state(pp_context); + ironlake_pp_upload_constants(pp_context); +} + +static void +ironlake_pp_pipeline_select(VADriverContextP ctx) +{ + BEGIN_BATCH(ctx, 1); + OUT_BATCH(ctx, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA); + ADVANCE_BATCH(ctx); +} + +static void +ironlake_pp_urb_layout(VADriverContextP ctx, struct i965_post_processing_context *pp_context) +{ + unsigned int vfe_fence, cs_fence; + + vfe_fence = pp_context->urb.cs_start; + cs_fence = pp_context->urb.size; + + BEGIN_BATCH(ctx, 3); + OUT_BATCH(ctx, CMD_URB_FENCE | UF0_VFE_REALLOC | UF0_CS_REALLOC | 1); + OUT_BATCH(ctx, 0); + OUT_BATCH(ctx, + (vfe_fence << UF2_VFE_FENCE_SHIFT) | /* VFE_SIZE */ + (cs_fence << UF2_CS_FENCE_SHIFT)); /* CS_SIZE */ + ADVANCE_BATCH(ctx); +} + +static void +ironlake_pp_state_base_address(VADriverContextP ctx) +{ + BEGIN_BATCH(ctx, 8); + OUT_BATCH(ctx, CMD_STATE_BASE_ADDRESS | 6); + OUT_BATCH(ctx, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(ctx, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(ctx, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(ctx, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(ctx, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(ctx, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(ctx, 0 | BASE_ADDRESS_MODIFY); + 
ADVANCE_BATCH(ctx); +} + +static void +ironlake_pp_state_pointers(VADriverContextP ctx, struct i965_post_processing_context *pp_context) +{ + BEGIN_BATCH(ctx, 3); + OUT_BATCH(ctx, CMD_MEDIA_STATE_POINTERS | 1); + OUT_BATCH(ctx, 0); + OUT_RELOC(ctx, pp_context->vfe_state.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + ADVANCE_BATCH(ctx); +} + +static void +ironlake_pp_cs_urb_layout(VADriverContextP ctx, struct i965_post_processing_context *pp_context) +{ + BEGIN_BATCH(ctx, 2); + OUT_BATCH(ctx, CMD_CS_URB_STATE | 0); + OUT_BATCH(ctx, + ((pp_context->urb.size_cs_entry - 1) << 4) | /* URB Entry Allocation Size */ + (pp_context->urb.num_cs_entries << 0)); /* Number of URB Entries */ + ADVANCE_BATCH(ctx); +} + +static void +ironlake_pp_constant_buffer(VADriverContextP ctx, struct i965_post_processing_context *pp_context) +{ + BEGIN_BATCH(ctx, 2); + OUT_BATCH(ctx, CMD_CONSTANT_BUFFER | (1 << 8) | (2 - 2)); + OUT_RELOC(ctx, pp_context->curbe.bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + pp_context->urb.size_cs_entry - 1); + ADVANCE_BATCH(ctx); +} + +static void +ironlake_pp_object_walker(VADriverContextP ctx, struct i965_post_processing_context *pp_context) +{ + int x, x_steps, y, y_steps; + + x_steps = pp_context->pp_x_steps(&pp_context->private_context); + y_steps = pp_context->pp_y_steps(&pp_context->private_context); + + for (y = 0; y < y_steps; y++) { + for (x = 0; x < x_steps; x++) { + if (!pp_context->pp_set_block_parameter(&pp_context->private_context, x, y)) { + BEGIN_BATCH(ctx, 20); + OUT_BATCH(ctx, CMD_MEDIA_OBJECT | 18); + OUT_BATCH(ctx, 0); + OUT_BATCH(ctx, 0); /* no indirect data */ + OUT_BATCH(ctx, 0); + + /* inline data grf 5-6 */ + assert(sizeof(ironlake_pp_inline_parameter) == 64); + intel_batchbuffer_data(ctx, &ironlake_pp_inline_parameter, sizeof(ironlake_pp_inline_parameter)); + + ADVANCE_BATCH(ctx); + } + } + } +} + +static void +ironlake_pp_pipeline_setup(VADriverContextP ctx) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct 
i965_post_processing_context *pp_context = &i965->render_state.pp_context; + + intel_batchbuffer_start_atomic(ctx, 0x1000); + intel_batchbuffer_emit_mi_flush(ctx); + ironlake_pp_pipeline_select(ctx); + ironlake_pp_state_base_address(ctx); + ironlake_pp_state_pointers(ctx, pp_context); + ironlake_pp_urb_layout(ctx, pp_context); + ironlake_pp_cs_urb_layout(ctx, pp_context); + ironlake_pp_constant_buffer(ctx, pp_context); + ironlake_pp_object_walker(ctx, pp_context); + intel_batchbuffer_end_atomic(ctx); +} + +static int +pp_null_x_steps(void *private_context) +{ + return 1; +} + +static int +pp_null_y_steps(void *private_context) +{ + return 1; +} + +static int +pp_null_set_block_parameter(void *private_context, int x, int y) +{ + return 0; +} + +static void +ironlake_pp_null_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct i965_post_processing_context *pp_context = &i965->render_state.pp_context; + struct object_surface *obj_surface; + + /* surface */ + obj_surface = SURFACE(surface); + dri_bo_unreference(obj_surface->pp_out_bo); + obj_surface->pp_out_bo = obj_surface->bo; + dri_bo_reference(obj_surface->pp_out_bo); + assert(obj_surface->pp_out_bo); + obj_surface->pp_out_width = obj_surface->width; + obj_surface->pp_out_height = obj_surface->height; + obj_surface->orig_pp_out_width = obj_surface->orig_width; + obj_surface->orig_pp_out_height = obj_surface->orig_height; + + /* private function & data */ + pp_context->pp_x_steps = pp_null_x_steps; + pp_context->pp_y_steps = pp_null_y_steps; + pp_context->pp_set_block_parameter = pp_null_set_block_parameter; +} + +static int +pp_load_save_x_steps(void *private_context) +{ + return 1; +} + +static int +pp_load_save_y_steps(void *private_context) +{ + struct pp_load_save_context *pp_load_save_context = private_context; + + return 
pp_load_save_context->dest_h / 8; +} + +static int +pp_load_save_set_block_parameter(void *private_context, int x, int y) +{ + ironlake_pp_inline_parameter.grf5.block_vertical_mask = 0xff; + ironlake_pp_inline_parameter.grf5.block_horizontal_mask = 0xffff; + ironlake_pp_inline_parameter.grf5.destination_block_horizontal_origin = x * 16; + ironlake_pp_inline_parameter.grf5.destination_block_vertical_origin = y * 8; + + return 0; +} + +static void +ironlake_pp_nv12_load_save_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct i965_post_processing_context *pp_context = &i965->render_state.pp_context; + struct pp_load_save_context *pp_load_save_context = (struct pp_load_save_context *)&pp_context->private_context; + struct object_surface *obj_surface; + struct i965_surface_state *ss; + dri_bo *bo; + int index, w, h; + int orig_w, orig_h; + + /* surface */ + obj_surface = SURFACE(surface); + orig_w = obj_surface->orig_width; + orig_h = obj_surface->orig_height; + w = obj_surface->width; + h = obj_surface->height; + + dri_bo_unreference(obj_surface->pp_out_bo); + obj_surface->pp_out_bo = dri_bo_alloc(i965->intel.bufmgr, + "intermediate surface", + SIZE_YUV420(w, h), + 4096); + assert(obj_surface->pp_out_bo); + obj_surface->pp_out_width = obj_surface->width; + obj_surface->pp_out_height = obj_surface->height; + obj_surface->orig_pp_out_width = obj_surface->orig_width; + obj_surface->orig_pp_out_height = obj_surface->orig_height; + + /* source Y surface index 1 */ + index = 1; + pp_context->surfaces[index].s_bo = obj_surface->bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = 
bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset; + ss->ss2.width = orig_w / 4 - 1; + ss->ss2.height = orig_h - 1; + ss->ss3.pitch = w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + 0, + 0, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* source UV surface index 2 */ + index = 2; + pp_context->surfaces[index].s_bo = obj_surface->bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h; + ss->ss2.width = orig_w / 4 - 1; + ss->ss2.height = orig_h / 2 - 1; + ss->ss3.pitch = w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + 0, + w * h, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* destination Y surface index 7 */ + index = 7; + pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset; + ss->ss2.width = orig_w / 4 - 1; + ss->ss2.height = orig_h - 1; + ss->ss3.pitch = w - 1; + 
dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + 0, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* destination UV surface index 8 */ + index = 8; + pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h; + ss->ss2.width = orig_w / 4 - 1; + ss->ss2.height = orig_h / 2 - 1; + ss->ss3.pitch = w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + w * h, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* private function & data */ + pp_context->pp_x_steps = pp_load_save_x_steps; + pp_context->pp_y_steps = pp_load_save_y_steps; + pp_context->pp_set_block_parameter = pp_load_save_set_block_parameter; + pp_load_save_context->dest_h = h; + pp_load_save_context->dest_w = w; + + ironlake_pp_inline_parameter.grf5.block_count_x = w / 16; /* 1 x N */ + ironlake_pp_inline_parameter.grf5.number_blocks = w / 16; +} + +static int +pp_scaling_x_steps(void *private_context) +{ + return 1; +} + +static int +pp_scaling_y_steps(void *private_context) +{ + struct pp_scaling_context *pp_scaling_context = private_context; + + return pp_scaling_context->dest_h / 8; +} + +static int +pp_scaling_set_block_parameter(void *private_context, int x, int y) +{ + float src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; + float src_y_steping = ironlake_pp_static_parameter.grf1.r1_6.normalized_video_y_scaling_step; + + 
ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin = src_x_steping * x * 16; + ironlake_pp_inline_parameter.grf5.source_surface_block_normalized_vertical_origin = src_y_steping * y * 8; + ironlake_pp_inline_parameter.grf5.destination_block_horizontal_origin = x * 16; + ironlake_pp_inline_parameter.grf5.destination_block_vertical_origin = y * 8; + + return 0; +} + +static void +ironlake_pp_nv12_scaling_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct i965_post_processing_context *pp_context = &i965->render_state.pp_context; + struct pp_scaling_context *pp_scaling_context = (struct pp_scaling_context *)&pp_context->private_context; + struct object_surface *obj_surface; + struct i965_sampler_state *sampler_state; + struct i965_surface_state *ss; + dri_bo *bo; + int index; + int w, h; + int orig_w, orig_h; + int pp_out_w, pp_out_h; + int orig_pp_out_w, orig_pp_out_h; + + /* surface */ + obj_surface = SURFACE(surface); + orig_w = obj_surface->orig_width; + orig_h = obj_surface->orig_height; + w = obj_surface->width; + h = obj_surface->height; + + orig_pp_out_w = destw; + orig_pp_out_h = desth; + pp_out_w = ALIGN(orig_pp_out_w, 16); + pp_out_h = ALIGN(orig_pp_out_h, 16); + dri_bo_unreference(obj_surface->pp_out_bo); + obj_surface->pp_out_bo = dri_bo_alloc(i965->intel.bufmgr, + "intermediate surface", + SIZE_YUV420(pp_out_w, pp_out_h), + 4096); + assert(obj_surface->pp_out_bo); + obj_surface->orig_pp_out_width = orig_pp_out_w; + obj_surface->orig_pp_out_height = orig_pp_out_h; + obj_surface->pp_out_width = pp_out_w; + obj_surface->pp_out_height = pp_out_h; + + /* source Y surface index 1 */ + index = 1; + pp_context->surfaces[index].s_bo = obj_surface->bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface 
state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset; + ss->ss2.width = orig_w - 1; + ss->ss2.height = orig_h - 1; + ss->ss3.pitch = w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + 0, + 0, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* source UV surface index 2 */ + index = 2; + pp_context->surfaces[index].s_bo = obj_surface->bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h; + ss->ss2.width = orig_w / 2 - 1; + ss->ss2.height = orig_h / 2 - 1; + ss->ss3.pitch = w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + 0, + w * h, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* destination Y surface index 7 */ + index = 7; + pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM; + 
ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset; + ss->ss2.width = pp_out_w / 4 - 1; + ss->ss2.height = pp_out_h - 1; + ss->ss3.pitch = pp_out_w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + 0, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* destination UV surface index 8 */ + index = 8; + pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + pp_out_w * pp_out_h; + ss->ss2.width = pp_out_w / 4 - 1; + ss->ss2.height = pp_out_h / 2 - 1; + ss->ss3.pitch = pp_out_w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + pp_out_w * pp_out_h, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* sampler state */ + dri_bo_map(pp_context->sampler_state_table.bo, True); + assert(pp_context->sampler_state_table.bo->virtual); + sampler_state = pp_context->sampler_state_table.bo->virtual; + + /* SIMD16 Y index 1 */ + sampler_state[1].ss0.min_filter = I965_MAPFILTER_LINEAR; + sampler_state[1].ss0.mag_filter = I965_MAPFILTER_LINEAR; + sampler_state[1].ss1.r_wrap_mode = I965_TEXCOORDMODE_CLAMP; + sampler_state[1].ss1.s_wrap_mode = I965_TEXCOORDMODE_CLAMP; + sampler_state[1].ss1.t_wrap_mode = I965_TEXCOORDMODE_CLAMP; + + /* SIMD16 UV index 2 */ + sampler_state[2].ss0.min_filter = I965_MAPFILTER_LINEAR; + sampler_state[2].ss0.mag_filter = I965_MAPFILTER_LINEAR; + sampler_state[2].ss1.r_wrap_mode = I965_TEXCOORDMODE_CLAMP; 
+ sampler_state[2].ss1.s_wrap_mode = I965_TEXCOORDMODE_CLAMP; + sampler_state[2].ss1.t_wrap_mode = I965_TEXCOORDMODE_CLAMP; + + dri_bo_unmap(pp_context->sampler_state_table.bo); + + /* private function & data */ + pp_context->pp_x_steps = pp_scaling_x_steps; + pp_context->pp_y_steps = pp_scaling_y_steps; + pp_context->pp_set_block_parameter = pp_scaling_set_block_parameter; + + pp_scaling_context->dest_w = pp_out_w; + pp_scaling_context->dest_h = pp_out_h; + + ironlake_pp_static_parameter.grf1.r1_6.normalized_video_y_scaling_step = (float) 1.0 / pp_out_h; + ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = (float) 1.0 / pp_out_w; + ironlake_pp_inline_parameter.grf5.block_count_x = pp_out_w / 16; /* 1 x N */ + ironlake_pp_inline_parameter.grf5.number_blocks = pp_out_w / 16; + ironlake_pp_inline_parameter.grf5.block_vertical_mask = 0xff; + ironlake_pp_inline_parameter.grf5.block_horizontal_mask = 0xffff; +} + +static int +pp_avs_x_steps(void *private_context) +{ + struct pp_avs_context *pp_avs_context = private_context; + + return pp_avs_context->dest_w / 16; +} + +static int +pp_avs_y_steps(void *private_context) +{ + return 1; +} + +static int +pp_avs_set_block_parameter(void *private_context, int x, int y) +{ + struct pp_avs_context *pp_avs_context = private_context; + float src_x_steping, src_y_steping, video_step_delta; + int tmp_w = ALIGN(pp_avs_context->dest_h * pp_avs_context->src_w / pp_avs_context->src_h, 16); + + if (tmp_w >= pp_avs_context->dest_w) { + ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = 1.0 / tmp_w; + ironlake_pp_inline_parameter.grf6.video_step_delta = 0; + + if (x == 0) { + ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin = (float)(tmp_w - pp_avs_context->dest_w) / tmp_w / 2; + } else { + src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; + video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta; + 
ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 + + 16 * 15 * video_step_delta / 2; + } + } else { + int n0, n1, n2, nls_left, nls_right; + int factor_a = 5, factor_b = 4; + float f; + + n0 = (pp_avs_context->dest_w - tmp_w) / (16 * 2); + n1 = (pp_avs_context->dest_w - tmp_w) / 16 - n0; + n2 = tmp_w / (16 * factor_a); + nls_left = n0 + n2; + nls_right = n1 + n2; + f = (float) n2 * 16 / tmp_w; + + if (n0 < 5) { + ironlake_pp_inline_parameter.grf6.video_step_delta = 0.0; + + if (x == 0) { + ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = 1.0 / pp_avs_context->dest_w; + ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin = 0.0; + } else { + src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; + video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta; + ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 + + 16 * 15 * video_step_delta / 2; + } + } else { + if (x < nls_left) { + /* f = a * nls_left * 16 + b * nls_left * 16 * (nls_left * 16 - 1) / 2 */ + float a = f / (nls_left * 16 * factor_b); + float b = (f - nls_left * 16 * a) * 2 / (nls_left * 16 * (nls_left * 16 - 1)); + + ironlake_pp_inline_parameter.grf6.video_step_delta = b; + + if (x == 0) { + ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin = 0.0; + ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = a; + } else { + src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; + video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta; + ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 + + 16 * 15 * video_step_delta / 2; + ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step += 16 * b; + } + } else if (x < (pp_avs_context->dest_w 
/ 16 - nls_right)) { + /* scale the center linearly */ + src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; + video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta; + ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 + + 16 * 15 * video_step_delta / 2; + ironlake_pp_inline_parameter.grf6.video_step_delta = 0.0; + ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = 1.0 / tmp_w; + } else { + float a = f / (nls_right * 16 * factor_b); + float b = (f - nls_right * 16 * a) * 2 / (nls_right * 16 * (nls_right * 16 - 1)); + + src_x_steping = ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step; + video_step_delta = ironlake_pp_inline_parameter.grf6.video_step_delta; + ironlake_pp_inline_parameter.grf5.r5_1.source_surface_block_normalized_horizontal_origin += src_x_steping * 16 + + 16 * 15 * video_step_delta / 2; + ironlake_pp_inline_parameter.grf6.video_step_delta = -b; + + if (x == (pp_avs_context->dest_w / 16 - nls_right)) + ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = a + (nls_right * 16 - 1) * b; + else + ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step -= b * 16; + } + } + } + + src_y_steping = ironlake_pp_static_parameter.grf1.r1_6.normalized_video_y_scaling_step; + ironlake_pp_inline_parameter.grf5.source_surface_block_normalized_vertical_origin = src_y_steping * y * 8; + ironlake_pp_inline_parameter.grf5.destination_block_horizontal_origin = x * 16; + ironlake_pp_inline_parameter.grf5.destination_block_vertical_origin = y * 8; + + return 0; +} + +static void +ironlake_pp_nv12_avs_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct i965_post_processing_context *pp_context = &i965->render_state.pp_context; + struct 
pp_avs_context *pp_avs_context = (struct pp_avs_context *)&pp_context->private_context; + struct object_surface *obj_surface; + struct i965_surface_state *ss; + struct i965_sampler_8x8 *sampler_8x8; + struct i965_sampler_8x8_state *sampler_8x8_state; + struct i965_surface_state2 *ss_8x8; + dri_bo *bo; + int index; + int w, h; + int orig_w, orig_h; + int pp_out_w, pp_out_h; + int orig_pp_out_w, orig_pp_out_h; + + /* surface */ + obj_surface = SURFACE(surface); + + if (input == 1) { + assert(obj_surface->pp_out_bo); + orig_w = obj_surface->orig_pp_out_width; + orig_h = obj_surface->orig_pp_out_height; + w = obj_surface->pp_out_width; + h = obj_surface->pp_out_height; + } else { + orig_w = obj_surface->orig_width; + orig_h = obj_surface->orig_height; + w = obj_surface->width; + h = obj_surface->height; + } + /* source Y surface index 1 */ + index = 1; + pp_context->surfaces[index].s_bo = (input == 1 ? obj_surface->pp_out_bo : obj_surface->bo); + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "Y surface state for sample_8x8", + sizeof(struct i965_surface_state2), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss_8x8 = bo->virtual; + memset(ss_8x8, 0, sizeof(*ss_8x8)); + ss_8x8->ss0.surface_base_address = pp_context->surfaces[index].s_bo->offset; + ss_8x8->ss1.cbcr_pixel_offset_v_direction = 0; + ss_8x8->ss1.width = orig_w - 1; + ss_8x8->ss1.height = orig_h - 1; + ss_8x8->ss2.half_pitch_for_chroma = 0; + ss_8x8->ss2.pitch = w - 1; + ss_8x8->ss2.interleave_chroma = 0; + ss_8x8->ss2.surface_format = SURFACE_FORMAT_Y8_UNORM; + ss_8x8->ss3.x_offset_for_cb = 0; + ss_8x8->ss3.y_offset_for_cb = 0; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + 0, + 0, + offsetof(struct i965_surface_state2, ss0), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* source UV surface index 2 */ + index = 2; + pp_context->surfaces[index].s_bo = (input == 1 ? 
obj_surface->pp_out_bo : obj_surface->bo); + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "UV surface state for sample_8x8", + sizeof(struct i965_surface_state2), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss_8x8 = bo->virtual; + memset(ss_8x8, 0, sizeof(*ss_8x8)); + ss_8x8->ss0.surface_base_address = pp_context->surfaces[index].s_bo->offset + w * h; + ss_8x8->ss1.cbcr_pixel_offset_v_direction = 0; + ss_8x8->ss1.width = orig_w / 2 - 1; + ss_8x8->ss1.height = orig_h / 2 - 1; + ss_8x8->ss2.half_pitch_for_chroma = 0; + ss_8x8->ss2.pitch = w - 1; + ss_8x8->ss2.interleave_chroma = 0; + ss_8x8->ss2.surface_format = SURFACE_FORMAT_R8B8_UNORM; + ss_8x8->ss3.x_offset_for_cb = 0; + ss_8x8->ss3.y_offset_for_cb = 0; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + 0, + w * h, + offsetof(struct i965_surface_state2, ss0), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + orig_pp_out_w = destw; + orig_pp_out_h = desth; + pp_out_w = ALIGN(orig_pp_out_w, 16); + pp_out_h = ALIGN(orig_pp_out_h, 16); + dri_bo_unreference(obj_surface->pp_out_bo); + obj_surface->pp_out_bo = dri_bo_alloc(i965->intel.bufmgr, + "intermediate surface", + SIZE_YUV420(pp_out_w, pp_out_h), + 4096); + assert(obj_surface->pp_out_bo); + obj_surface->orig_pp_out_width = orig_pp_out_w; + obj_surface->orig_pp_out_height = orig_pp_out_h; + obj_surface->pp_out_width = pp_out_w; + obj_surface->pp_out_height = pp_out_h; + + /* destination Y surface index 7 */ + index = 7; + pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + 
ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset; + ss->ss2.width = pp_out_w / 4 - 1; + ss->ss2.height = pp_out_h - 1; + ss->ss3.pitch = pp_out_w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + 0, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* destination UV surface index 8 */ + index = 8; + pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + pp_out_w * pp_out_h; + ss->ss2.width = pp_out_w / 4 - 1; + ss->ss2.height = pp_out_h / 2 - 1; + ss->ss3.pitch = pp_out_w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + pp_out_w * pp_out_h, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* sampler 8x8 state */ + dri_bo_map(pp_context->sampler_state_table.bo_8x8, True); + assert(pp_context->sampler_state_table.bo_8x8->virtual); + assert(sizeof(*sampler_8x8_state) == sizeof(int) * 138); + sampler_8x8_state = pp_context->sampler_state_table.bo_8x8->virtual; + memset(sampler_8x8_state, 0, sizeof(*sampler_8x8_state)); + sampler_8x8_state->dw136.default_sharpness_level = 0; + sampler_8x8_state->dw137.adaptive_filter_for_all_channel = 1; + sampler_8x8_state->dw137.bypass_y_adaptive_filtering = 1; + sampler_8x8_state->dw137.bypass_x_adaptive_filtering = 1; + dri_bo_unmap(pp_context->sampler_state_table.bo_8x8); + + /* sampler 8x8 */ + 
dri_bo_map(pp_context->sampler_state_table.bo, True); + assert(pp_context->sampler_state_table.bo->virtual); + assert(sizeof(*sampler_8x8) == sizeof(int) * 16); + sampler_8x8 = pp_context->sampler_state_table.bo->virtual; + + /* sample_8x8 Y index 1 */ + index = 1; + memset(&sampler_8x8[index], 0, sizeof(*sampler_8x8)); + sampler_8x8[index].dw0.avs_filter_type = AVS_FILTER_ADAPTIVE_8_TAP; + sampler_8x8[index].dw0.ief_bypass = 0; + sampler_8x8[index].dw0.ief_filter_type = IEF_FILTER_DETAIL; + sampler_8x8[index].dw0.ief_filter_size = IEF_FILTER_SIZE_5X5; + sampler_8x8[index].dw1.sampler_8x8_state_pointer = pp_context->sampler_state_table.bo_8x8->offset >> 5; + sampler_8x8[index].dw2.global_noise_estimation = 22; + sampler_8x8[index].dw2.strong_edge_threshold = 8; + sampler_8x8[index].dw2.weak_edge_threshold = 1; + sampler_8x8[index].dw3.strong_edge_weight = 7; + sampler_8x8[index].dw3.regular_weight = 2; + sampler_8x8[index].dw3.non_edge_weight = 0; + sampler_8x8[index].dw3.gain_factor = 40; + sampler_8x8[index].dw4.steepness_boost = 0; + sampler_8x8[index].dw4.steepness_threshold = 0; + sampler_8x8[index].dw4.mr_boost = 0; + sampler_8x8[index].dw4.mr_threshold = 5; + sampler_8x8[index].dw5.pwl1_point_1 = 4; + sampler_8x8[index].dw5.pwl1_point_2 = 12; + sampler_8x8[index].dw5.pwl1_point_3 = 16; + sampler_8x8[index].dw5.pwl1_point_4 = 26; + sampler_8x8[index].dw6.pwl1_point_5 = 40; + sampler_8x8[index].dw6.pwl1_point_6 = 160; + sampler_8x8[index].dw6.pwl1_r3_bias_0 = 127; + sampler_8x8[index].dw6.pwl1_r3_bias_1 = 98; + sampler_8x8[index].dw7.pwl1_r3_bias_2 = 88; + sampler_8x8[index].dw7.pwl1_r3_bias_3 = 64; + sampler_8x8[index].dw7.pwl1_r3_bias_4 = 44; + sampler_8x8[index].dw7.pwl1_r3_bias_5 = 0; + sampler_8x8[index].dw8.pwl1_r3_bias_6 = 0; + sampler_8x8[index].dw8.pwl1_r5_bias_0 = 3; + sampler_8x8[index].dw8.pwl1_r5_bias_1 = 32; + sampler_8x8[index].dw8.pwl1_r5_bias_2 = 32; + sampler_8x8[index].dw9.pwl1_r5_bias_3 = 58; + sampler_8x8[index].dw9.pwl1_r5_bias_4 = 100; + 
sampler_8x8[index].dw9.pwl1_r5_bias_5 = 108; + sampler_8x8[index].dw9.pwl1_r5_bias_6 = 88; + sampler_8x8[index].dw10.pwl1_r3_slope_0 = -116; + sampler_8x8[index].dw10.pwl1_r3_slope_1 = -20; + sampler_8x8[index].dw10.pwl1_r3_slope_2 = -96; + sampler_8x8[index].dw10.pwl1_r3_slope_3 = -32; + sampler_8x8[index].dw11.pwl1_r3_slope_4 = -50; + sampler_8x8[index].dw11.pwl1_r3_slope_5 = 0; + sampler_8x8[index].dw11.pwl1_r3_slope_6 = 0; + sampler_8x8[index].dw11.pwl1_r5_slope_0 = 116; + sampler_8x8[index].dw12.pwl1_r5_slope_1 = 0; + sampler_8x8[index].dw12.pwl1_r5_slope_2 = 114; + sampler_8x8[index].dw12.pwl1_r5_slope_3 = 67; + sampler_8x8[index].dw12.pwl1_r5_slope_4 = 9; + sampler_8x8[index].dw13.pwl1_r5_slope_5 = -3; + sampler_8x8[index].dw13.pwl1_r5_slope_6 = -15; + sampler_8x8[index].dw13.limiter_boost = 0; + sampler_8x8[index].dw13.minimum_limiter = 10; + sampler_8x8[index].dw13.maximum_limiter = 11; + sampler_8x8[index].dw14.clip_limiter = 130; + dri_bo_emit_reloc(pp_context->sampler_state_table.bo, + I915_GEM_DOMAIN_RENDER, + 0, + 0, + sizeof(*sampler_8x8) * index + offsetof(struct i965_sampler_8x8, dw1), + pp_context->sampler_state_table.bo_8x8); + + dri_bo_map(pp_context->sampler_state_table.bo_8x8_uv, True); + assert(pp_context->sampler_state_table.bo_8x8_uv->virtual); + assert(sizeof(*sampler_8x8_state) == sizeof(int) * 138); + sampler_8x8_state = pp_context->sampler_state_table.bo_8x8_uv->virtual; + memset(sampler_8x8_state, 0, sizeof(*sampler_8x8_state)); + sampler_8x8_state->dw136.default_sharpness_level = 0; + sampler_8x8_state->dw137.adaptive_filter_for_all_channel = 0; + sampler_8x8_state->dw137.bypass_y_adaptive_filtering = 1; + sampler_8x8_state->dw137.bypass_x_adaptive_filtering = 1; + dri_bo_unmap(pp_context->sampler_state_table.bo_8x8_uv); + + /* sample_8x8 UV index 2 */ + index = 2; + memset(&sampler_8x8[index], 0, sizeof(*sampler_8x8)); + sampler_8x8[index].dw0.avs_filter_type = AVS_FILTER_NEAREST; + sampler_8x8[index].dw0.ief_bypass = 0; + 
sampler_8x8[index].dw0.ief_filter_type = IEF_FILTER_DETAIL; + sampler_8x8[index].dw0.ief_filter_size = IEF_FILTER_SIZE_5X5; + sampler_8x8[index].dw1.sampler_8x8_state_pointer = pp_context->sampler_state_table.bo_8x8_uv->offset >> 5; + sampler_8x8[index].dw2.global_noise_estimation = 22; + sampler_8x8[index].dw2.strong_edge_threshold = 8; + sampler_8x8[index].dw2.weak_edge_threshold = 1; + sampler_8x8[index].dw3.strong_edge_weight = 7; + sampler_8x8[index].dw3.regular_weight = 2; + sampler_8x8[index].dw3.non_edge_weight = 0; + sampler_8x8[index].dw3.gain_factor = 40; + sampler_8x8[index].dw4.steepness_boost = 0; + sampler_8x8[index].dw4.steepness_threshold = 0; + sampler_8x8[index].dw4.mr_boost = 0; + sampler_8x8[index].dw4.mr_threshold = 5; + sampler_8x8[index].dw5.pwl1_point_1 = 4; + sampler_8x8[index].dw5.pwl1_point_2 = 12; + sampler_8x8[index].dw5.pwl1_point_3 = 16; + sampler_8x8[index].dw5.pwl1_point_4 = 26; + sampler_8x8[index].dw6.pwl1_point_5 = 40; + sampler_8x8[index].dw6.pwl1_point_6 = 160; + sampler_8x8[index].dw6.pwl1_r3_bias_0 = 127; + sampler_8x8[index].dw6.pwl1_r3_bias_1 = 98; + sampler_8x8[index].dw7.pwl1_r3_bias_2 = 88; + sampler_8x8[index].dw7.pwl1_r3_bias_3 = 64; + sampler_8x8[index].dw7.pwl1_r3_bias_4 = 44; + sampler_8x8[index].dw7.pwl1_r3_bias_5 = 0; + sampler_8x8[index].dw8.pwl1_r3_bias_6 = 0; + sampler_8x8[index].dw8.pwl1_r5_bias_0 = 3; + sampler_8x8[index].dw8.pwl1_r5_bias_1 = 32; + sampler_8x8[index].dw8.pwl1_r5_bias_2 = 32; + sampler_8x8[index].dw9.pwl1_r5_bias_3 = 58; + sampler_8x8[index].dw9.pwl1_r5_bias_4 = 100; + sampler_8x8[index].dw9.pwl1_r5_bias_5 = 108; + sampler_8x8[index].dw9.pwl1_r5_bias_6 = 88; + sampler_8x8[index].dw10.pwl1_r3_slope_0 = -116; + sampler_8x8[index].dw10.pwl1_r3_slope_1 = -20; + sampler_8x8[index].dw10.pwl1_r3_slope_2 = -96; + sampler_8x8[index].dw10.pwl1_r3_slope_3 = -32; + sampler_8x8[index].dw11.pwl1_r3_slope_4 = -50; + sampler_8x8[index].dw11.pwl1_r3_slope_5 = 0; + sampler_8x8[index].dw11.pwl1_r3_slope_6 = 0; 
+ sampler_8x8[index].dw11.pwl1_r5_slope_0 = 116; + sampler_8x8[index].dw12.pwl1_r5_slope_1 = 0; + sampler_8x8[index].dw12.pwl1_r5_slope_2 = 114; + sampler_8x8[index].dw12.pwl1_r5_slope_3 = 67; + sampler_8x8[index].dw12.pwl1_r5_slope_4 = 9; + sampler_8x8[index].dw13.pwl1_r5_slope_5 = -3; + sampler_8x8[index].dw13.pwl1_r5_slope_6 = -15; + sampler_8x8[index].dw13.limiter_boost = 0; + sampler_8x8[index].dw13.minimum_limiter = 10; + sampler_8x8[index].dw13.maximum_limiter = 11; + sampler_8x8[index].dw14.clip_limiter = 130; + dri_bo_emit_reloc(pp_context->sampler_state_table.bo, + I915_GEM_DOMAIN_RENDER, + 0, + 0, + sizeof(*sampler_8x8) * index + offsetof(struct i965_sampler_8x8, dw1), + pp_context->sampler_state_table.bo_8x8_uv); + + dri_bo_unmap(pp_context->sampler_state_table.bo); + + /* private function & data */ + pp_context->pp_x_steps = pp_avs_x_steps; + pp_context->pp_y_steps = pp_avs_y_steps; + pp_context->pp_set_block_parameter = pp_avs_set_block_parameter; + + pp_avs_context->dest_w = pp_out_w; + pp_avs_context->dest_h = pp_out_h; + pp_avs_context->src_w = w; + pp_avs_context->src_h = h; + + ironlake_pp_static_parameter.grf4.r4_2.avs.nlas = 1; + ironlake_pp_static_parameter.grf1.r1_6.normalized_video_y_scaling_step = (float) 1.0 / pp_out_h; + ironlake_pp_inline_parameter.grf5.normalized_video_x_scaling_step = (float) 1.0 / pp_out_w; + ironlake_pp_inline_parameter.grf5.block_count_x = 1; /* M x 1 */ + ironlake_pp_inline_parameter.grf5.number_blocks = pp_out_h / 8; + ironlake_pp_inline_parameter.grf5.block_vertical_mask = 0xff; + ironlake_pp_inline_parameter.grf5.block_horizontal_mask = 0xffff; + ironlake_pp_inline_parameter.grf6.video_step_delta = 0.0; +} + +static int +pp_dndi_x_steps(void *private_context) +{ + return 1; +} + +static int +pp_dndi_y_steps(void *private_context) +{ + struct pp_dndi_context *pp_dndi_context = private_context; + + return pp_dndi_context->dest_h / 4; +} + +static int +pp_dndi_set_block_parameter(void *private_context, int x, int 
y) +{ + ironlake_pp_inline_parameter.grf5.destination_block_horizontal_origin = x * 16; + ironlake_pp_inline_parameter.grf5.destination_block_vertical_origin = y * 4; + + return 0; +} + +static +void ironlake_pp_nv12_dndi_initialize(VADriverContextP ctx, VASurfaceID surface, int input, + unsigned short srcw, unsigned short srch, + unsigned short destw, unsigned short desth) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct i965_post_processing_context *pp_context = &i965->render_state.pp_context; + struct pp_dndi_context *pp_dndi_context = (struct pp_dndi_context *)&pp_context->private_context; + struct object_surface *obj_surface; + struct i965_surface_state *ss; + struct i965_surface_state2 *ss_dndi; + struct i965_sampler_dndi *sampler_dndi; + dri_bo *bo; + int index; + int w, h; + int orig_w, orig_h; + + /* surface */ + obj_surface = SURFACE(surface); + orig_w = obj_surface->orig_width; + orig_h = obj_surface->orig_height; + w = obj_surface->width; + h = obj_surface->height; + + if (pp_context->stmm.bo == NULL) { + pp_context->stmm.bo = dri_bo_alloc(i965->intel.bufmgr, + "STMM surface", + w * h, + 4096); + assert(pp_context->stmm.bo); + } + + dri_bo_unreference(obj_surface->pp_out_bo); + obj_surface->pp_out_bo = dri_bo_alloc(i965->intel.bufmgr, + "intermediate surface", + SIZE_YUV420(w, h), + 4096); + assert(obj_surface->pp_out_bo); + obj_surface->orig_pp_out_width = orig_w; + obj_surface->orig_pp_out_height = orig_h; + obj_surface->pp_out_width = w; + obj_surface->pp_out_height = h; + + /* source UV surface index 2 */ + index = 2; + pp_context->surfaces[index].s_bo = obj_surface->bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + 
ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h; + ss->ss2.width = orig_w / 4 - 1; + ss->ss2.height = orig_h / 2 - 1; + ss->ss3.pitch = w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + 0, + w * h, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* source YUV surface index 4 */ + index = 4; + pp_context->surfaces[index].s_bo = obj_surface->bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "YUV surface state for deinterlace ", + sizeof(struct i965_surface_state2), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss_dndi = bo->virtual; + memset(ss_dndi, 0, sizeof(*ss_dndi)); + ss_dndi->ss0.surface_base_address = pp_context->surfaces[index].s_bo->offset; + ss_dndi->ss1.cbcr_pixel_offset_v_direction = 0; + ss_dndi->ss1.width = w - 1; + ss_dndi->ss1.height = h - 1; + ss_dndi->ss1.cbcr_pixel_offset_v_direction = 1; + ss_dndi->ss2.half_pitch_for_chroma = 0; + ss_dndi->ss2.pitch = w - 1; + ss_dndi->ss2.interleave_chroma = 1; + ss_dndi->ss2.surface_format = SURFACE_FORMAT_PLANAR_420_8; + ss_dndi->ss2.half_pitch_for_chroma = 0; + ss_dndi->ss2.tiled_surface = 0; + ss_dndi->ss3.x_offset_for_cb = 0; + ss_dndi->ss3.y_offset_for_cb = h; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + 0, + 0, + offsetof(struct i965_surface_state2, ss0), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* source STMM surface index 20 */ + index = 20; + pp_context->surfaces[index].s_bo = pp_context->stmm.bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "STMM surface state for deinterlace ", + sizeof(struct i965_surface_state2), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, 
sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset; + ss->ss2.width = w - 1; + ss->ss2.height = h - 1; + ss->ss3.pitch = w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + 0, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* destination Y surface index 7 */ + index = 7; + pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset; + ss->ss2.width = w / 4 - 1; + ss->ss2.height = h - 1; + ss->ss3.pitch = w - 1; + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + 0, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* destination UV surface index 8 */ + index = 8; + pp_context->surfaces[index].s_bo = obj_surface->pp_out_bo; + dri_bo_reference(pp_context->surfaces[index].s_bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "surface state", + sizeof(struct i965_surface_state), + 4096); + assert(bo); + pp_context->surfaces[index].ss_bo = bo; + dri_bo_map(bo, True); + assert(bo->virtual); + ss = bo->virtual; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_2D; + ss->ss0.surface_format = I965_SURFACEFORMAT_R8G8_UNORM; + ss->ss1.base_addr = pp_context->surfaces[index].s_bo->offset + w * h; + ss->ss2.width = w / 4 - 1; + ss->ss2.height = h / 2 - 1; + ss->ss3.pitch = w - 1; + dri_bo_emit_reloc(bo, + 
I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + w * h, + offsetof(struct i965_surface_state, ss1), + pp_context->surfaces[index].s_bo); + dri_bo_unmap(bo); + + /* sampler dndi */ + dri_bo_map(pp_context->sampler_state_table.bo, True); + assert(pp_context->sampler_state_table.bo->virtual); + assert(sizeof(*sampler_dndi) == sizeof(int) * 8); + sampler_dndi = pp_context->sampler_state_table.bo->virtual; + + /* sample dndi index 1 */ + index = 0; + sampler_dndi[index].dw0.denoise_asd_threshold = 0; + sampler_dndi[index].dw0.denoise_history_delta = 8; // 0-15, default is 8 + sampler_dndi[index].dw0.denoise_maximum_history = 128; // 128-240 + sampler_dndi[index].dw0.denoise_stad_threshold = 0; + + sampler_dndi[index].dw1.denoise_threshold_for_sum_of_complexity_measure = 64; + sampler_dndi[index].dw1.denoise_moving_pixel_threshold = 0; + sampler_dndi[index].dw1.stmm_c2 = 0; + sampler_dndi[index].dw1.low_temporal_difference_threshold = 8; + sampler_dndi[index].dw1.temporal_difference_threshold = 16; + + sampler_dndi[index].dw2.block_noise_estimate_noise_threshold = 15; // 0-31 + sampler_dndi[index].dw2.block_noise_estimate_edge_threshold = 7; // 0-15 + sampler_dndi[index].dw2.denoise_edge_threshold = 7; // 0-15 + sampler_dndi[index].dw2.good_neighbor_threshold = 7; // 0-63 + + sampler_dndi[index].dw3.maximum_stmm = 128; + sampler_dndi[index].dw3.multipler_for_vecm = 2; + sampler_dndi[index].dw3.blending_constant_across_time_for_small_values_of_stmm = 0; + sampler_dndi[index].dw3.blending_constant_across_time_for_large_values_of_stmm = 64; + sampler_dndi[index].dw3.stmm_blending_constant_select = 0; + + sampler_dndi[index].dw4.sdi_delta = 8; + sampler_dndi[index].dw4.sdi_threshold = 128; + sampler_dndi[index].dw4.stmm_output_shift = 7; // stmm_max - stmm_min = 2 ^ stmm_output_shift + sampler_dndi[index].dw4.stmm_shift_up = 0; + sampler_dndi[index].dw4.stmm_shift_down = 0; + sampler_dndi[index].dw4.minimum_stmm = 0; + + 
sampler_dndi[index].dw5.fmd_temporal_difference_threshold = 0; + sampler_dndi[index].dw5.sdi_fallback_mode_2_constant = 0; + sampler_dndi[index].dw5.sdi_fallback_mode_1_t2_constant = 0; + sampler_dndi[index].dw5.sdi_fallback_mode_1_t1_constant = 0; + + sampler_dndi[index].dw6.dn_enable = 1; + sampler_dndi[index].dw6.di_enable = 1; + sampler_dndi[index].dw6.di_partial = 0; + sampler_dndi[index].dw6.dndi_top_first = 1; + sampler_dndi[index].dw6.dndi_stream_id = 1; + sampler_dndi[index].dw6.dndi_first_frame = 1; + sampler_dndi[index].dw6.progressive_dn = 0; + sampler_dndi[index].dw6.fmd_tear_threshold = 32; + sampler_dndi[index].dw6.fmd2_vertical_difference_threshold = 32; + sampler_dndi[index].dw6.fmd1_vertical_difference_threshold = 32; + + sampler_dndi[index].dw7.fmd_for_1st_field_of_current_frame = 2; + sampler_dndi[index].dw7.fmd_for_2nd_field_of_previous_frame = 1; + sampler_dndi[index].dw7.vdi_walker_enable = 0; + sampler_dndi[index].dw7.column_width_minus1 = w / 16; + + dri_bo_unmap(pp_context->sampler_state_table.bo); + + /* private function & data */ + pp_context->pp_x_steps = pp_dndi_x_steps; + pp_context->pp_y_steps = pp_dndi_y_steps; + pp_context->pp_set_block_parameter = pp_dndi_set_block_parameter; + + ironlake_pp_static_parameter.grf1.statistics_surface_picth = w / 2; + ironlake_pp_static_parameter.grf1.r1_6.di.top_field_first = 0; + ironlake_pp_static_parameter.grf4.r4_2.di.motion_history_coefficient_m2 = 64; + ironlake_pp_static_parameter.grf4.r4_2.di.motion_history_coefficient_m1 = 192; + + ironlake_pp_inline_parameter.grf5.block_count_x = w / 16; /* 1 x N */ + ironlake_pp_inline_parameter.grf5.number_blocks = w / 16; + ironlake_pp_inline_parameter.grf5.block_vertical_mask = 0xff; + ironlake_pp_inline_parameter.grf5.block_horizontal_mask = 0xffff; + + pp_dndi_context->dest_w = w; + pp_dndi_context->dest_h = h; +} + +static void +ironlake_pp_initialize(VADriverContextP ctx, + VASurfaceID surface, + int input, + short srcx, + short srcy, + unsigned 
short srcw, + unsigned short srch, + short destx, + short desty, + unsigned short destw, + unsigned short desth, + int pp_index) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct i965_post_processing_context *pp_context = &i965->render_state.pp_context; + struct pp_module *pp_module; + dri_bo *bo; + int i; + + dri_bo_unreference(pp_context->curbe.bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "constant buffer", + 4096, + 4096); + assert(bo); + pp_context->curbe.bo = bo; + + dri_bo_unreference(pp_context->binding_table.bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "binding table", + sizeof(unsigned int), + 4096); + assert(bo); + pp_context->binding_table.bo = bo; + + dri_bo_unreference(pp_context->idrt.bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "interface discriptor", + sizeof(struct i965_interface_descriptor), + 4096); + assert(bo); + pp_context->idrt.bo = bo; + + dri_bo_unreference(pp_context->sampler_state_table.bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "sampler state table", + 4096, + 4096); + assert(bo); + dri_bo_map(bo, True); + memset(bo->virtual, 0, bo->size); + dri_bo_unmap(bo); + pp_context->sampler_state_table.bo = bo; + + dri_bo_unreference(pp_context->sampler_state_table.bo_8x8); + bo = dri_bo_alloc(i965->intel.bufmgr, + "sampler 8x8 state ", + 4096, + 4096); + assert(bo); + pp_context->sampler_state_table.bo_8x8 = bo; + + dri_bo_unreference(pp_context->sampler_state_table.bo_8x8_uv); + bo = dri_bo_alloc(i965->intel.bufmgr, + "sampler 8x8 state ", + 4096, + 4096); + assert(bo); + pp_context->sampler_state_table.bo_8x8_uv = bo; + + dri_bo_unreference(pp_context->vfe_state.bo); + bo = dri_bo_alloc(i965->intel.bufmgr, + "vfe state", + sizeof(struct i965_vfe_state), + 4096); + assert(bo); + pp_context->vfe_state.bo = bo; + + for (i = 0; i < MAX_PP_SURFACES; i++) { + dri_bo_unreference(pp_context->surfaces[i].ss_bo); + pp_context->surfaces[i].ss_bo = NULL; + + dri_bo_unreference(pp_context->surfaces[i].s_bo); + 
pp_context->surfaces[i].s_bo = NULL; + } + + memset(&ironlake_pp_static_parameter, 0, sizeof(ironlake_pp_static_parameter)); + memset(&ironlake_pp_inline_parameter, 0, sizeof(ironlake_pp_inline_parameter)); + assert(pp_index >= PP_NULL && pp_index < NUM_PP_MODULES); + assert(pp_modules); + pp_context->current_pp = pp_index; + pp_module = &pp_modules[pp_index]; + + if (pp_module->initialize) + pp_module->initialize(ctx, surface, input, srcw, srch, destw, desth); +} + +static void +i965_post_processing_internal(VADriverContextP ctx, + VASurfaceID surface, + int input, + short srcx, + short srcy, + unsigned short srcw, + unsigned short srch, + short destx, + short desty, + unsigned short destw, + unsigned short desth, + int pp_index) +{ + ironlake_pp_initialize(ctx, surface, input, + srcx, srcy, srcw, srch, + destx, desty, destw, desth, + pp_index); + ironlake_pp_states_setup(ctx); + ironlake_pp_pipeline_setup(ctx); +} + +void +i965_post_processing(VADriverContextP ctx, + VASurfaceID surface, + short srcx, + short srcy, + unsigned short srcw, + unsigned short srch, + short destx, + short desty, + unsigned short destw, + unsigned short desth, + unsigned int flag) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + + if (IS_IRONLAKE(i965->intel.device_id)) { + /* Currently only support post processing for NV12 surface */ + if (i965->render_state.interleaved_uv) { + int input = 0; + + if (flag & I965_PP_FLAG_DEINTERLACING) { + i965_post_processing_internal(ctx, surface, input, + srcx, srcy, srcw, srch, + destx, desty, destw, desth, + PP_NV12_DNDI); + input = 1; + } + + if (flag & I965_PP_FLAG_AVS) { + i965_post_processing_internal(ctx, surface, input, + srcx, srcy, srcw, srch, + destx, desty, destw, desth, + PP_NV12_AVS); + } + } + } +} + +void +i965_post_processing_once_init(VADriverContextP ctx) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct i965_post_processing_context *pp_context = &i965->render_state.pp_context; + int i; + + 
pp_context->urb.size = URB_SIZE((&i965->intel)); + pp_context->urb.num_vfe_entries = 32; + pp_context->urb.size_vfe_entry = 1; + pp_context->urb.num_cs_entries = 1; + pp_context->urb.size_cs_entry = 2; + pp_context->urb.vfe_start = 0; + pp_context->urb.cs_start = pp_context->urb.vfe_start + + pp_context->urb.num_vfe_entries * pp_context->urb.size_vfe_entry; + assert(pp_context->urb.cs_start + + pp_context->urb.num_cs_entries * pp_context->urb.size_cs_entry <= URB_SIZE((&i965->intel))); + + if (IS_IRONLAKE(i965->intel.device_id)) { + pp_modules = pp_modules_gen5; + } + + for (i = 0; i < NUM_PP_MODULES && pp_modules; i++) { + struct pp_module *pp_module = &pp_modules[i]; + pp_module->bo = dri_bo_alloc(i965->intel.bufmgr, + pp_module->name, + pp_module->size, + 4096); + assert(pp_module->bo); + dri_bo_subdata(pp_module->bo, 0, pp_module->size, pp_module->bin); + } +} + +Bool +i965_post_processing_terminate(VADriverContextP ctx) +{ + struct i965_driver_data *i965 = i965_driver_data(ctx); + struct i965_post_processing_context *pp_context = &i965->render_state.pp_context; + int i; + + dri_bo_unreference(pp_context->curbe.bo); + pp_context->curbe.bo = NULL; + + for (i = 0; i < MAX_PP_SURFACES; i++) { + dri_bo_unreference(pp_context->surfaces[i].ss_bo); + pp_context->surfaces[i].ss_bo = NULL; + + dri_bo_unreference(pp_context->surfaces[i].s_bo); + pp_context->surfaces[i].s_bo = NULL; + } + + dri_bo_unreference(pp_context->sampler_state_table.bo); + pp_context->sampler_state_table.bo = NULL; + + dri_bo_unreference(pp_context->sampler_state_table.bo_8x8); + pp_context->sampler_state_table.bo_8x8 = NULL; + + dri_bo_unreference(pp_context->sampler_state_table.bo_8x8_uv); + pp_context->sampler_state_table.bo_8x8_uv = NULL; + + dri_bo_unreference(pp_context->binding_table.bo); + pp_context->binding_table.bo = NULL; + + dri_bo_unreference(pp_context->idrt.bo); + pp_context->idrt.bo = NULL; + + dri_bo_unreference(pp_context->vfe_state.bo); + pp_context->vfe_state.bo = NULL; + + 
dri_bo_unreference(pp_context->stmm.bo); + pp_context->stmm.bo = NULL; + + for (i = 0; i < NUM_PP_MODULES && pp_modules; i++) { + struct pp_module *pp_module = &pp_modules[i]; + + dri_bo_unreference(pp_module->bo); + pp_module->bo = NULL; + } + + return True; +} diff --git a/i965_drv_video/i965_post_processing.h b/i965_drv_video/i965_post_processing.h new file mode 100644 index 0000000..360ded4 --- /dev/null +++ b/i965_drv_video/i965_post_processing.h @@ -0,0 +1,150 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Xiang Haihao <haihao.xiang@intel.com> + * + */ + +#ifndef __I965_POST_PROCESSING_H__ +#define __I965_POST_PROCESSING_H__ + +#define MAX_PP_SURFACES 32 + +#define I965_PP_FLAG_DEINTERLACING 1 +#define I965_PP_FLAG_AVS 2 + +enum +{ + PP_NULL = 0, + PP_NV12_LOAD_SAVE, + PP_NV12_SCALING, + PP_NV12_AVS, + PP_NV12_DNDI, +}; + +struct pp_load_save_context +{ + int dest_w; + int dest_h; +}; + +struct pp_scaling_context +{ + int dest_w; + int dest_h; +}; + +struct pp_avs_context +{ + int dest_w; + int dest_h; + int src_w; + int src_h; +}; + +struct pp_dndi_context +{ + int dest_w; + int dest_h; + +}; + +struct i965_post_processing_context +{ + int current_pp; + + struct { + dri_bo *bo; + } curbe; + + struct { + dri_bo *ss_bo; + dri_bo *s_bo; + } surfaces[MAX_PP_SURFACES]; + + struct { + dri_bo *bo; + } binding_table; + + struct { + dri_bo *bo; + } idrt; + + struct { + dri_bo *bo; + } vfe_state; + + struct { + dri_bo *bo; + dri_bo *bo_8x8; + dri_bo *bo_8x8_uv; + } sampler_state_table; + + struct { + unsigned int size; + + unsigned int vfe_start; + unsigned int cs_start; + + unsigned int num_vfe_entries; + unsigned int num_cs_entries; + + unsigned int size_vfe_entry; + unsigned int size_cs_entry; + } urb; + + struct { + dri_bo *bo; + } stmm; + + union { + struct pp_load_save_context pp_load_save_context; + struct pp_scaling_context pp_scaling_context; + struct pp_avs_context pp_avs_context; + struct pp_dndi_context pp_dndi_context; + } private_context; + + int (*pp_x_steps)(void *private_context); + int (*pp_y_steps)(void *private_context); + int (*pp_set_block_parameter)(void *private_context, int x, int y); +}; + +void +i965_post_processing(VADriverContextP ctx, + VASurfaceID surface, + short srcx, + short srcy, + unsigned short srcw, + unsigned short srch, + short destx, + short desty, + unsigned short destw, + unsigned short desth, + unsigned int pp_index); +void +i965_post_processing_once_init(VADriverContextP ctx); +Bool 
+i965_post_processing_terminate(VADriverContextP ctx); + +#endif /* __I965_POST_PROCESSING_H__ */ diff --git a/i965_drv_video/i965_render.c b/i965_drv_video/i965_render.c index c4e8ed8..ceef319 100644 --- a/i965_drv_video/i965_render.c +++ b/i965_drv_video/i965_render.c @@ -655,12 +655,20 @@ i965_render_src_surfaces_state(VADriverContextP ctx, obj_surface = SURFACE(surface); assert(obj_surface); - assert(obj_surface->bo); - w = obj_surface->width; - h = obj_surface->height; - rw = obj_surface->orig_width; - rh = obj_surface->orig_height; - region = obj_surface->bo; + + if (obj_surface->pp_out_bo) { + w = obj_surface->pp_out_width; + h = obj_surface->pp_out_height; + rw = obj_surface->orig_pp_out_width; + rh = obj_surface->orig_pp_out_height; + region = obj_surface->pp_out_bo; + } else { + w = obj_surface->width; + h = obj_surface->height; + rw = obj_surface->orig_width; + rh = obj_surface->orig_height; + region = obj_surface->bo; + } i965_render_src_surface_state(ctx, 1, region, 0, rw, rh, w, I965_SURFACEFORMAT_R8_UNORM); /* Y */ i965_render_src_surface_state(ctx, 2, region, 0, rw, rh, w, I965_SURFACEFORMAT_R8_UNORM); @@ -1454,8 +1462,14 @@ i965_render_put_surface(VADriverContextP ctx, short destx, short desty, unsigned short destw, - unsigned short desth) + unsigned short desth, + unsigned int flag) { + i965_post_processing(ctx, surface, + srcx, srcy, srcw, srch, + destx, desty, destw, desth, + flag); + i965_render_initialize(ctx); i965_surface_render_state_setup(ctx, surface, srcx, srcy, srcw, srch, @@ -1523,6 +1537,8 @@ i965_render_init(VADriverContextP ctx) assert(render_state->curbe.bo); render_state->curbe.upload = 0; + i965_post_processing_once_init(ctx); + return True; } @@ -1533,6 +1549,8 @@ i965_render_terminate(VADriverContextP ctx) struct i965_driver_data *i965 = i965_driver_data(ctx); struct i965_render_state *render_state = &i965->render_state; + i965_post_processing_terminate(ctx); + dri_bo_unreference(render_state->curbe.bo); render_state->curbe.bo 
= NULL; diff --git a/i965_drv_video/i965_render.h b/i965_drv_video/i965_render.h index 9abb81f..84b50f2 100644 --- a/i965_drv_video/i965_render.h +++ b/i965_drv_video/i965_render.h @@ -31,6 +31,8 @@ #define MAX_RENDER_SURFACES 16 #define MAX_SAMPLERS 16 +#include "i965_post_processing.h" + struct i965_render_state { struct { @@ -65,6 +67,9 @@ struct i965_render_state int interleaved_uv; struct intel_region *draw_region; + + int pp_flag; /* 0: disable, 1: enable */ + struct i965_post_processing_context pp_context; }; Bool i965_render_init(VADriverContextP ctx); @@ -78,7 +83,8 @@ void i965_render_put_surface(VADriverContextP ctx, short destx, short desty, unsigned short destw, - unsigned short desth); + unsigned short desth, + unsigned int flag); void diff --git a/i965_drv_video/i965_structs.h b/i965_drv_video/i965_structs.h index d133446..f8be616 100644 --- a/i965_drv_video/i965_structs.h +++ b/i965_drv_video/i965_structs.h @@ -639,4 +639,329 @@ struct i965_cc_unit_state } cc7; }; +struct i965_sampler_8x8 +{ + struct { + unsigned int pad0:16; + unsigned int chroma_key_index:2; + unsigned int chroma_key_enable:1; + unsigned int pad1:8; + unsigned int ief_filter_size:1; + unsigned int ief_filter_type:1; + unsigned int ief_bypass:1; + unsigned int pad2:1; + unsigned int avs_filter_type:1; + } dw0; + + struct { + unsigned int pad0:5; + unsigned int sampler_8x8_state_pointer:27; + } dw1; + + struct { + unsigned int weak_edge_threshold:4; + unsigned int strong_edge_threshold:4; + unsigned int global_noise_estimation:8; + unsigned int pad0:16; + } dw2; + + struct { + unsigned int r3x_coefficient:5; + unsigned int pad0:1; + unsigned int r3c_coefficient:5; + unsigned int pad1:3; + unsigned int gain_factor:6; + unsigned int non_edge_weight:3; + unsigned int pad2:1; + unsigned int regular_weight:3; + unsigned int pad3:1; + unsigned int strong_edge_weight:3; + unsigned int pad4:1; + } dw3; + + struct { + unsigned int pad0:2; + unsigned int mr_boost:1; + unsigned int 
mr_threshold:4; + unsigned int steepness_boost:1; + unsigned int steepness_threshold:4; + unsigned int pad1:2; + unsigned int r5x_coefficient:5; + unsigned int pad2:1; + unsigned int r5cx_coefficient:5; + unsigned int pad3:1; + unsigned int r5c_coefficient:5; + unsigned int pad4:1; + } dw4; + + struct { + unsigned int pwl1_point_1:8; + unsigned int pwl1_point_2:8; + unsigned int pwl1_point_3:8; + unsigned int pwl1_point_4:8; + } dw5; + + struct { + unsigned int pwl1_point_5:8; + unsigned int pwl1_point_6:8; + unsigned int pwl1_r3_bias_0:8; + unsigned int pwl1_r3_bias_1:8; + } dw6; + + struct { + unsigned int pwl1_r3_bias_2:8; + unsigned int pwl1_r3_bias_3:8; + unsigned int pwl1_r3_bias_4:8; + unsigned int pwl1_r3_bias_5:8; + } dw7; + + struct { + unsigned int pwl1_r3_bias_6:8; + unsigned int pwl1_r5_bias_0:8; + unsigned int pwl1_r5_bias_1:8; + unsigned int pwl1_r5_bias_2:8; + } dw8; + + struct { + unsigned int pwl1_r5_bias_3:8; + unsigned int pwl1_r5_bias_4:8; + unsigned int pwl1_r5_bias_5:8; + unsigned int pwl1_r5_bias_6:8; + } dw9; + + struct { + int pwl1_r3_slope_0:8; + int pwl1_r3_slope_1:8; + int pwl1_r3_slope_2:8; + int pwl1_r3_slope_3:8; + } dw10; + + struct { + int pwl1_r3_slope_4:8; + int pwl1_r3_slope_5:8; + int pwl1_r3_slope_6:8; + int pwl1_r5_slope_0:8; + } dw11; + + struct { + int pwl1_r5_slope_1:8; + int pwl1_r5_slope_2:8; + int pwl1_r5_slope_3:8; + int pwl1_r5_slope_4:8; + } dw12; + + struct { + int pwl1_r5_slope_5:8; + int pwl1_r5_slope_6:8; + unsigned int limiter_boost:4; + unsigned int pad0:4; + unsigned int minimum_limiter:4; + unsigned int maximum_limiter:4; + } dw13; + + struct { + unsigned int pad0:8; + unsigned int clip_limiter:10; + unsigned int pad1:14; + } dw14; + + unsigned int dw15; /* Just a pad */ +}; + +struct i965_sampler_8x8_coefficient +{ + struct { + int table_0x_filter_c0:8; + int table_0x_filter_c1:8; + int table_0x_filter_c2:8; + int table_0x_filter_c3:8; + } dw0; + + struct { + int table_0x_filter_c4:8; + int 
table_0x_filter_c5:8; + int table_0x_filter_c6:8; + int table_0x_filter_c7:8; + } dw1; + + struct { + int table_0y_filter_c0:8; + int table_0y_filter_c1:8; + int table_0y_filter_c2:8; + int table_0y_filter_c3:8; + } dw2; + + struct { + int table_0y_filter_c4:8; + int table_0y_filter_c5:8; + int table_0y_filter_c6:8; + int table_0y_filter_c7:8; + } dw3; + + struct { + int pad0:16; + int table_1x_filter_c2:8; + int table_1x_filter_c3:8; + } dw4; + + struct { + int table_1x_filter_c4:8; + int table_1x_filter_c5:8; + int pad0:16; + } dw5; + + struct { + int pad0:16; + int table_1y_filter_c2:8; + int table_1y_filter_c3:8; + } dw6; + + struct { + int table_1y_filter_c4:8; + int table_1y_filter_c5:8; + int pad0:16; + } dw7; +}; + +struct i965_sampler_8x8_state +{ + struct i965_sampler_8x8_coefficient coefficients[17]; + + struct { + unsigned int transition_area_with_8_pixels:3; + unsigned int pad0:1; + unsigned int transition_area_with_4_pixels:3; + unsigned int pad1:1; + unsigned int max_derivative_8_pixels:8; + unsigned int max_derivative_4_pixels:8; + unsigned int default_sharpness_level:8; + } dw136; + + struct { + unsigned int bit_field_name:1; + unsigned int adaptive_filter_for_all_channel:1; + unsigned int pad0:19; + unsigned int bypass_y_adaptive_filtering:1; + unsigned int bypass_x_adaptive_filtering:1; + unsigned int pad1:9; + } dw137; +}; + +struct i965_surface_state2 +{ + struct { + unsigned int surface_base_address; + } ss0; + + struct { + unsigned int cbcr_pixel_offset_v_direction:2; + unsigned int pad0:4; + unsigned int width:13; + unsigned int height:13; + } ss1; + + struct { + unsigned int tile_walk:1; + unsigned int tiled_surface:1; + unsigned int half_pitch_for_chroma:1; + unsigned int pitch:17; + unsigned int pad0:2; + unsigned int surface_object_control_data:4; + unsigned int pad1:1; + unsigned int interleave_chroma:1; + unsigned int surface_format:4; + } ss2; + + struct { + unsigned int y_offset_for_cb:13; + unsigned int pad0:3; + unsigned int 
x_offset_for_cb:13; + unsigned int pad1:3; + } ss3; + + struct { + unsigned int y_offset_for_cr:13; + unsigned int pad0:3; + unsigned int x_offset_for_cr:13; + unsigned int pad1:3; + } ss4; +}; + +struct i965_sampler_dndi +{ + struct { + unsigned int denoise_asd_threshold:8; + unsigned int denoise_history_delta:8; + unsigned int denoise_maximum_history:8; + unsigned int denoise_stad_threshold:8; + } dw0; + + struct { + unsigned int denoise_threshold_for_sum_of_complexity_measure:8; + unsigned int denoise_moving_pixel_threshold:5; + unsigned int stmm_c2:3; + unsigned int low_temporal_difference_threshold:6; + unsigned int pad0:2; + unsigned int temporal_difference_threshold:6; + unsigned int pad1:2; + } dw1; + + struct { + unsigned int block_noise_estimate_noise_threshold:8; + unsigned int block_noise_estimate_edge_threshold:8; + unsigned int denoise_edge_threshold:8; + unsigned int good_neighbor_threshold:8; + } dw2; + + struct { + unsigned int maximum_stmm:8; + unsigned int multipler_for_vecm:6; + unsigned int pad0:2; + unsigned int blending_constant_across_time_for_small_values_of_stmm:8; + unsigned int blending_constant_across_time_for_large_values_of_stmm:7; + unsigned int stmm_blending_constant_select:1; + } dw3; + + struct { + unsigned int sdi_delta:8; + unsigned int sdi_threshold:8; + unsigned int stmm_output_shift:4; + unsigned int stmm_shift_up:2; + unsigned int stmm_shift_down:2; + unsigned int minimum_stmm:8; + } dw4; + + struct { + unsigned int fmd_temporal_difference_threshold:8; + unsigned int sdi_fallback_mode_2_constant:8; + unsigned int sdi_fallback_mode_1_t2_constant:8; + unsigned int sdi_fallback_mode_1_t1_constant:8; + } dw5; + + struct { + unsigned int dn_enable:1; + unsigned int di_enable:1; + unsigned int di_partial:1; + unsigned int dndi_top_first:1; + unsigned int dndi_stream_id:1; + unsigned int dndi_first_frame:1; + unsigned int progressive_dn:1; + unsigned int pad0:1; + unsigned int fmd_tear_threshold:6; + unsigned int pad1:2; + unsigned 
int fmd2_vertical_difference_threshold:8; + unsigned int fmd1_vertical_difference_threshold:8; + } dw6; + + struct { + unsigned int pad0:8; + unsigned int fmd_for_1st_field_of_current_frame:2; + unsigned int pad1:6; + unsigned int fmd_for_2nd_field_of_previous_frame:2; + unsigned int vdi_walker_enable:1; + unsigned int pad2:4; + unsigned int column_width_minus1:9; + } dw7; +}; + #endif /* _I965_STRUCTS_H_ */ diff --git a/i965_drv_video/intel_batchbuffer.c b/i965_drv_video/intel_batchbuffer.c index abe548e..15c3201 100644 --- a/i965_drv_video/intel_batchbuffer.c +++ b/i965_drv_video/intel_batchbuffer.c @@ -37,7 +37,7 @@ static void intel_batchbuffer_reset(struct intel_batchbuffer *batch) { struct intel_driver_data *intel = batch->intel; - int batch_size = batch->flag == I915_EXEC_RENDER ? BATCH_SIZE : (BATCH_SIZE * 8); + int batch_size = BATCH_SIZE; assert(batch->flag == I915_EXEC_RENDER || batch->flag == I915_EXEC_BSD); diff --git a/i965_drv_video/intel_driver.h b/i965_drv_video/intel_driver.h index ffa8cad..1e2adfa 100644 --- a/i965_drv_video/intel_driver.h +++ b/i965_drv_video/intel_driver.h @@ -17,7 +17,7 @@ #define INLINE #endif -#define BATCH_SIZE 0x10000 +#define BATCH_SIZE 0x80000 #define BATCH_RESERVED 0x10 #define CMD_MI (0x0 << 29) diff --git a/i965_drv_video/shaders/Makefile.am b/i965_drv_video/shaders/Makefile.am index 2fd019b..e2b6223 100644 --- a/i965_drv_video/shaders/Makefile.am +++ b/i965_drv_video/shaders/Makefile.am @@ -1 +1 @@ -SUBDIRS = h264 mpeg2 render +SUBDIRS = h264 mpeg2 render post_processing diff --git a/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.asm b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.asm new file mode 100644 index 0000000..f6c3a33 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.asm @@ -0,0 +1,53 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. 
+ * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: AYUV_Load_16x8.asm +//---------------------------------------------------------------- + + +#include "AYUV_Load_16x8.inc" + +// In order to load 64x8 AYUV data (16x8 pixels), we need to divide the data +// into two regions and load them separately. +// +// 32 byte 32 byte +//|----------------|----------------| +//| | | +//| A | B |8 +//| | | +//| | | +//|----------------|----------------| + +// Load the first 32x8 data block +// Packed data block should be loaded as 32x8 pixel block + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Block origin + shl (1) rMSGSRC.0<1>:d acc0:w 2:w { NoDDClr } // H. block origin needs to be four times larger + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_YUV:ud { NoDDChk } // Block width and height (32x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_YUV(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud + +//Load the second 32x8 data block +// Offset the origin X - move to next 32 columns + add (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 32:w // Increase X origin by 32 bytes (8 AYUV pixels) + +// Size stays the same - 32x8 + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud // Copy message description to message header + send (8) udSRC_YUV(8)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud + +// Give AYUV region addresses to address register + mov (1) SRC_YUV_OFFSET<1>:ud 0x00400038*32:ud //Address registers contain starting addresses of two halves + +//Directly move the data to destination + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (16) uwDEST_Y(%1)<1> r[SRC_YUV_OFFSET,%1*32+2]<8,4>:ub + mov (16) uwDEST_U(%1)<1> r[SRC_YUV_OFFSET,%1*32+1]<8,4>:ub + mov (16) uwDEST_V(%1)<1> r[SRC_YUV_OFFSET,%1*32+0]<8,4>:ub + } +
\ No newline at end of file diff --git a/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.inc b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.inc new file mode 100644 index 0000000..422dfb3 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/AYUV_Load_16x8.inc @@ -0,0 +1,43 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: AYUV_Load_16x8.inc +// +// AYUV data are first loaded to bottom I/O REGION_2, then unpacked to planar data +// and stored in top I/O REGION_1 + +#undef nY_NUM_OF_ROWS + +#define nY_NUM_OF_ROWS 8 // Number of Y rows per block + +#define nDPR_BLOCK_SIZE_YUV nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // Y block size 32x8 +#define nDPR_MSG_SIZE_YUV nRESLEN_8 // # of MRF's to hold Y block data (8) + +//Temporary storage for unpacked AYUV data +#define rUNPACK_TEMP REG(r,nTEMP0) +.declare udUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=4 SrcRegion=<8;8,1> Type=ud //1 GRF +.declare ubUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=1 SrcRegion=<32;32,1> Type=ub //1 GRF + +.declare ubBOT_Y_IO Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(32,1) Type=ub + + +#define udSRC_YUV udBOT_Y_IO +#define ubSRC_YUV ubBOT_Y_IO +#define nSRC_YUV_REG nBOT_Y + +#define uwDEST_Y uwTOP_Y +#define uwDEST_U uwTOP_U +#define uwDEST_V uwTOP_V + +#define SRC_YUV_OFFSET a0.0 + +#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel + +// End of AYUV_Load_16x8.inc diff --git a/i965_drv_video/shaders/post_processing/Common/Expansion.inc b/i965_drv_video/shaders/post_processing/Common/Expansion.inc new file mode 100644 index 0000000..7f3d5aa --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/Expansion.inc @@ -0,0 +1,31 @@ +/* + * All Video Processing 
kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: Expansion.inc +// Number of U/V rows per block definition +#undef nUV_NUM_OF_ROWS +#ifdef EXPAND_9x5 + #define nUV_NUM_OF_ROWS 6 +#else + #define nUV_NUM_OF_ROWS 8 +#endif + +// Source/destination region definitions +#undef uwDEST_U +#undef uwDEST_V +#if (nSRC_REGION==nREGION_1) + #define uwDEST_U uwTOP_U + #define uwDEST_V uwTOP_V +#elif (nSRC_REGION==nREGION_2) + #define uwDEST_U uwBOT_U + #define uwDEST_V uwBOT_V +#endif + +// End of Expansion.inc diff --git a/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x4.asm b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x4.asm new file mode 100644 index 0000000..2817175 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x4.asm @@ -0,0 +1,47 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: IMC3_Load_8x4.asm +// +//---------------------------------------------------------------- + +#define IMC3_LOAD_8x4 +#include "PL3_Load.inc" + +// Load 16x8 planar Y ---------------------------------------------------------- + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin +#if !defined(LOAD_UV_ONLY) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Block width and height (16x8) + + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + +// Load 8x4 planar U and V ----------------------------------------------------- + asr (2) rMSGSRC.0<1>:d rMSGSRC.0<2;2,1>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (8x4) + + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud + mov (8) mMSGHDRV<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_V(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud + +// Convert to word-aligned format ---------------------------------------------- +#if !defined(LOAD_UV_ONLY) + $for (nY_NUM_OF_ROWS-1; >-1; -1) { + mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16) + } +#endif + $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) { + mov (16) uwDEST_U(0, %1*16)<1> ubSRC_U(0, %1*16) + mov (16) uwDEST_V(0, %1*16)<1> ubSRC_V(0, %1*16) + } + +// End of IMC3_Load_8x4 diff --git a/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x5.asm b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x5.asm new file mode 100644 index 0000000..3c96e72 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_8x5.asm @@ -0,0 +1,47 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. 
+ * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: IMC3_Load_8x5.asm +// +//---------------------------------------------------------------- + +#define IMC3_LOAD_8x5 +#include "PL3_Load.inc" + +// Load 16x8 planar Y ---------------------------------------------------------- + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin +#if !defined(LOAD_UV_ONLY) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Block width and height (16x8) + + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + +// Load 8x5 planar U and V ----------------------------------------------------- + asr (2) rMSGSRC.0<1>:d rMSGSRC.0<2;2,1>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (8x5) + + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud + mov (8) mMSGHDRV<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_V(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud + +// Convert to word-aligned format ---------------------------------------------- +#if !defined(LOAD_UV_ONLY) + $for (nY_NUM_OF_ROWS-1; >-1; -1) { + mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16) + } +#endif + $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) { + mov (16) uwDEST_U(0, %1*16)<1> ubSRC_U(0, %1*16) + mov (16) uwDEST_V(0, %1*16)<1> ubSRC_V(0, %1*16) + } + +// End of IMC3_Load_8x5 diff --git a/i965_drv_video/shaders/post_processing/Common/IMC3_Load_9x5.asm b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_9x5.asm new file mode 100644 index 0000000..d286cbb --- /dev/null +++ 
b/i965_drv_video/shaders/post_processing/Common/IMC3_Load_9x5.asm @@ -0,0 +1,50 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: IMC3_Load_9x5.asm +// +//---------------------------------------------------------------- +// This module loads 16x8 Y, 9x5 U and 9x5 V planar data blocks for CSC module +// and stores it in byte-aligned format. +//---------------------------------------------------------------- + +#define IMC3_LOAD_9x5 +#include "PL3_Load.inc" + +// Load 16x8 planar Y ---------------------------------------------------------- + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin +#if !defined(LOAD_UV_ONLY) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Block width and height (16x8) + + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + +// Load 9x5 planar U and V ----------------------------------------------------- + asr (2) rMSGSRC.0<1>:d rMSGSRC.0<2;2,1>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (12x5) + + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_U:ud + mov (8) mMSGHDRV<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_V(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_V:ud + +// Convert to word-aligned format ---------------------------------------------- +#if !defined(LOAD_UV_ONLY) + $for (nY_NUM_OF_ROWS-1; >-1; -1) { + mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16) + } +#endif + $for(nUV_NUM_OF_ROWS-2; >-1; -1) { + 
mov (16) uwDEST_U(0, %1*16)<1> ubSRC_U(0, %1*16) + mov (16) uwDEST_V(0, %1*16)<1> ubSRC_V(0, %1*16) + } + +// End of IMC3_Load_9x5 diff --git a/i965_drv_video/shaders/post_processing/Common/Init_All_Regs.asm b/i965_drv_video/shaders/post_processing/Common/Init_All_Regs.asm new file mode 100644 index 0000000..cb0fd41 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/Init_All_Regs.asm @@ -0,0 +1,18 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +#ifdef GT // to remove error messages of un-initialized GRF + .declare udGRF_space Base=r0.0 ElementSize=4 SrcRegion=REGION(8,1) Type=ud + + $for (7; <80; 1) { + mov (8) udGRF_space(%1)<1> 0:ud + } +#else +#endif
\ No newline at end of file diff --git a/i965_drv_video/shaders/post_processing/Common/Multiple_Loop.asm b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop.asm new file mode 100644 index 0000000..8a9fd96 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop.asm @@ -0,0 +1,84 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +///////////////////////////////////////////////////////////////////////////////// +// Multiple_Loop.asm + + +// This label is for satisfying component kernel build. +// DL will remove this label and reference the real one in Multiple_Loop_Head.asm. +#if defined(COMPONENT) +VIDEO_PROCESSING_LOOP: +#endif + + +//===== Possible build flags for component kernels +// 1) INC_SCALING +// 2) INC_BLENDING +// 3) INC_BLENDING and INC_SCALING +// 4) (no flags) + + +#define MxN_MULTIPLE_BLOCKS + +//------------------------------------------------------------------------------ +#if defined(MxN_MULTIPLE_BLOCKS) +// Do Multiple Block Processing ------------------------------------------------ + + // The 1st block has been processed before entering the loop + + // Processed all blocks? + add.z.f0.0 (1) wNUM_BLKS:w wNUM_BLKS:w -1:w + + // Reached multi-block width? 
+ add (1) wORIX:w wORIX:w 16:w + cmp.l.f0.1 (1) null:w acc0.0:w wFRAME_ENDX:w // acc0.0 has wORIX + + #if defined(INC_SCALING) + // Update SRC_VID_H_ORI for scaling + mul (1) REG(r,nTEMP0):f fVIDEO_STEP_X:f 16.0:f + add (1) fSRC_VID_H_ORI:f REG(r,nTEMP0):f fSRC_VID_H_ORI:f + #endif + + #if defined(INC_BLENDING) + // Update SRC_ALPHA_H_ORI for blending + mul (1) REG(r,nTEMP0):f fALPHA_STEP_X:f 16.0:f + add (1) fSRC_ALPHA_H_ORI:f REG(r,nTEMP0):f fSRC_ALPHA_H_ORI:f + #endif + + (f0.0)jmpi (1) END_VIDEO_PROCESSING // All blocks are done - Exit loop + + (f0.1)jmpi (1) VIDEO_PROCESSING_LOOP // If not the end of row, goto the beginning of the loop + + //If end of row, restart Horizontal offset and calculate Vertical offsets next row. + mov (1) wORIX:w wCOPY_ORIX:w + add (1) wORIY:w wORIY:w 8:w + + #if defined(INC_SCALING) + // Update SRC_VID_H_ORI and SRC_VID_V_ORI for scaling + mov (1) fSRC_VID_H_ORI:f fFRAME_VID_ORIX:f // Reset normalised X origin to 0 for video and alpha + mul (1) REG(r,nTEMP0):f fVIDEO_STEP_Y:f 8.0:f + add (1) fSRC_VID_V_ORI:f REG(r,nTEMP0):f fSRC_VID_V_ORI:f + #endif + + #if defined(INC_BLENDING) + // Update SRC_ALPHA_H_ORI and SRC_ALPHA_V_ORI for blending + mov (1) fSRC_ALPHA_H_ORI:f fFRAME_ALPHA_ORIX:f // Reset normalised X origin to 0 for video and alpha + mul (1) REG(r,nTEMP0):f fALPHA_STEP_Y:f 8.0:f + add (1) fSRC_ALPHA_V_ORI:f REG(r,nTEMP0):f fSRC_ALPHA_V_ORI:f + #endif + + jmpi (1) VIDEO_PROCESSING_LOOP // Continue Loop + +END_VIDEO_PROCESSING: + nop + +#endif +END_THREAD // End of Thread
\ No newline at end of file diff --git a/i965_drv_video/shaders/post_processing/Common/Multiple_Loop_Head.asm b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop_Head.asm new file mode 100644 index 0000000..77d8b94 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/Multiple_Loop_Head.asm @@ -0,0 +1,23 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +////////////////////////////////////////////////////////////////////////////////// +// Multiple_Loop_Head.asm +// This code sets up the loop control for multiple blocks per thread + + mul (1) wFRAME_ENDX:w ubBLK_CNT_X:ub 16:uw { NoDDClr } // Build multi-block loop counters + mov (1) wNUM_BLKS:w ubNUM_BLKS:ub { NoDDClr, NoDDChk } // Copy num blocks to word variable + mov (1) wCOPY_ORIX:w wORIX:w { NoDDChk } // Copy multi-block origin in pixel + mov (2) fFRAME_VID_ORIX<1>:f fSRC_VID_H_ORI<4;2,2>:f // Copy src video origin for scaling, and alpha origin for blending + add (1) wFRAME_ENDX:w wFRAME_ENDX:w wORIX:w // Continue building multi-block loop counters + +VIDEO_PROCESSING_LOOP: // Loop back entry point as the beginning of the loop for multiple blocks + +// Beginning of the loop diff --git a/i965_drv_video/shaders/post_processing/Common/NV11_Load_4x8.asm b/i965_drv_video/shaders/post_processing/Common/NV11_Load_4x8.asm new file mode 100644 index 0000000..54af8d1 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/NV11_Load_4x8.asm @@ -0,0 +1,42 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +// Module name: NV11_Load_4x8.asm +//---------------------------------------------------------------- + +#define NV11_LOAD_4x8 +#include "PL2_Load.inc" + +// Load 16x8 NV11 Y ------------------------------------------------------------ + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin +#if !defined(LOAD_UV_ONLY) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + +// Load 8x8 NV11 UV ---------------------------------------------------------- + asr (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (8x8) + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud + +// Convert to word-aligned format (U from even bytes, V from odd bytes of ubSRC_U) --- +#if !defined(LOAD_UV_ONLY) + $for (nY_NUM_OF_ROWS-1; >-1; -1) { + mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16) + } +#endif + $for (nUV_NUM_OF_ROWS/4-1; >-1; -1) { + mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<32;16,2> + mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<32;16,2> + } + +// End of NV11_Load_4x8 diff --git a/i965_drv_video/shaders/post_processing/Common/NV11_Load_5x8.asm b/i965_drv_video/shaders/post_processing/Common/NV11_Load_5x8.asm new file mode 100644 index 0000000..86a1d35 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/NV11_Load_5x8.asm @@ -0,0 +1,42 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +// Module name: NV11_Load_5x8.asm +//---------------------------------------------------------------- + +#define NV11_LOAD_5x8 +#include "PL2_Load.inc" + +// Load 16x8 NV11 Y ------------------------------------------------------------ + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin +#if !defined(LOAD_UV_ONLY) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + +// Load 12x8 NV11 UV --------------------------------------------------------- + asr (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (12x8) + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud + +// Convert to word-aligned format (U from even bytes, V from odd bytes of ubSRC_U) --- +#if !defined(LOAD_UV_ONLY) + $for (nY_NUM_OF_ROWS-1; >-1; -1) { + mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16) + } +#endif + $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) { + mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<16;8,2> + mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<16;8,2> + } + +// End of NV11_Load_5x8 diff --git a/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x4.asm b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x4.asm new file mode 100644 index 0000000..dbc47d4 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x4.asm @@ -0,0 +1,42 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +// Module name: NV12_Load_8x4.asm +//---------------------------------------------------------------- + +#define NV12_LOAD_8x4 +#include "PL2_Load.inc" + +// Load 16x8 planar Y ---------------------------------------------------------- + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin +#if !defined(LOAD_UV_ONLY) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + +// Load 8x4 planar U and V ----------------------------------------------------- + asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (16x4) + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud + +// Convert to word-aligned format (U from even bytes, V from odd bytes of ubSRC_U) --- +#if !defined(LOAD_UV_ONLY) + $for (nY_NUM_OF_ROWS-1; >-1; -1) { + mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16) + } +#endif + $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) { + mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<32;16,2> + mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<32;16,2> + } + +// End of NV12_Load_8x4 diff --git a/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x5.asm b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x5.asm new file mode 100644 index 0000000..85f5ec7 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/NV12_Load_8x5.asm @@ -0,0 +1,42 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0.
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: NV12_Load_8x5.asm +//---------------------------------------------------------------- + +#define NV12_LOAD_8x5 +#include "PL2_Load.inc" + +// Load 16x8 planar Y ---------------------------------------------------------- + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin +#if !defined(LOAD_UV_ONLY) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + +// Load 8x5 planar U and V ----------------------------------------------------- + asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (16x5) + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud + +// Convert to word-aligned format (U from even bytes, V from odd bytes of ubSRC_U) --- +#if !defined(LOAD_UV_ONLY) + $for (nY_NUM_OF_ROWS-1; >-1; -1) { + mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16) + } +#endif + $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) { + mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<16;8,2> + mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<16;8,2> + } + +// End of NV12_Load_8x5 diff --git a/i965_drv_video/shaders/post_processing/Common/NV12_Load_9x5.asm b/i965_drv_video/shaders/post_processing/Common/NV12_Load_9x5.asm new file mode 100644 index 0000000..b19f0b2 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/NV12_Load_9x5.asm @@ -0,0 +1,42 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0.
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: NV12_Load_9x5.asm +//---------------------------------------------------------------- + +#define NV12_LOAD_9x5 +#include "PL2_Load.inc" + +// Load 16x8 planar Y ---------------------------------------------------------- + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin +#if !defined(LOAD_UV_ONLY) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + +// Load 9x5 planar U and V ----------------------------------------------------- + asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (20x5) + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud + +// Convert to word-aligned format (U from even bytes, V from odd bytes of ubSRC_U) --- +#if !defined(LOAD_UV_ONLY) + $for (nY_NUM_OF_ROWS-1; >-1; -1) { + mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16) + } +#endif + $for(nUV_NUM_OF_ROWS-2; >-1; -1) { + mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<16;8,2> + mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<16;8,2> + } + +// End of NV12_Load_9x5 diff --git a/i965_drv_video/shaders/post_processing/Common/P208_Load_8x8.asm b/i965_drv_video/shaders/post_processing/Common/P208_Load_8x8.asm new file mode 100644 index 0000000..70d07eb --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/P208_Load_8x8.asm @@ -0,0 +1,41 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0.
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: P208_Load_8x8.asm +//---------------------------------------------------------------- + +#define P208_LOAD_8x8 +#include "PL2_Load.inc" + + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y Block width and height (16x8) (U/V block size is the same) + +// Load 16x8 P208 Y ------------------------------------------------------------ +#if !defined(LOAD_UV_ONLY) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + + // Load 16x8 planar UV ----------------------------------------------------- + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud + +// Convert to word-aligned format (U from even bytes, V from odd bytes of ubSRC_U) --- +#if !defined(LOAD_UV_ONLY) + $for (0; <nY_NUM_OF_ROWS; 1) { + mov (16) uwDEST_Y(0,%1*16) ubSRC_Y(0,%1*16) + } +#endif + $for (0; <nUV_NUM_OF_ROWS/2; 1) { + mov (16) uwDEST_U(0,%1*16) ubSRC_U(0,%1*32)<32;16,2> + mov (16) uwDEST_V(0,%1*16) ubSRC_U(0,%1*32+1)<32;16,2> + } + +// End of P208_Load_8x8.asm diff --git a/i965_drv_video/shaders/post_processing/Common/P208_Load_9x8.asm b/i965_drv_video/shaders/post_processing/Common/P208_Load_9x8.asm new file mode 100644 index 0000000..c6ff086 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/P208_Load_9x8.asm @@ -0,0 +1,42 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +// Module name: P208_Load_9x8.asm +//---------------------------------------------------------------- + +#define P208_LOAD_9x8 +#include "PL2_Load.inc" + + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin + +// Load 16x8 P208 Y ------------------------------------------------------------ +#if !defined(LOAD_UV_ONLY) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_Y:ud // Y block width and height (16x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_CURRENT_SRC_Y:ud +#endif + + // Load 20x8 planar UV ----------------------------------------------------- + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (20x8) + mov (8) mMSGHDRU<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud + +// Convert to word-aligned format (U from even bytes, V from odd bytes of ubSRC_U) --- +#if !defined(LOAD_UV_ONLY) + $for (0; <nY_NUM_OF_ROWS; 1) { + mov (16) uwDEST_Y(0,%1*16) ubSRC_Y(0,%1*16) + } +#endif + $for (0; <nUV_NUM_OF_ROWS; 1) { + mov (16) uwDEST_U(0,%1*16) ubSRC_U(0,%1*32)<32;16,2> + mov (16) uwDEST_V(0,%1*16) ubSRC_U(0,%1*32+1)<32;16,2> + } + +// End of P208_Load_9x8.asm diff --git a/i965_drv_video/shaders/post_processing/Common/PA_Load.inc b/i965_drv_video/shaders/post_processing/Common/PA_Load.inc new file mode 100644 index 0000000..dee657e --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PA_Load.inc @@ -0,0 +1,42 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +// Module name: PA_Load.inc +// +// YUV422 data are first loaded to bottom I/O REGION_2, then unpacked to planar data +// and stored in top I/O REGION_1 + +#undef nY_NUM_OF_ROWS +#undef nUV_NUM_OF_ROWS + +#define nY_NUM_OF_ROWS 8 // Number of Y rows per block +#define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block + +#if defined(PA_LOAD_8x8) + #define nDPR_BLOCK_SIZE_YUV nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // Packed YUV block size 32x8 + #define nDPR_MSG_SIZE_YUV nRESLEN_8 // # of MRF's to hold packed YUV block data (8) +#endif +#if defined(PA_LOAD_9x8) + #define nDPR_BLOCK_SIZE_YUV_MAIN nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // Main YUV block size 32x8 + #define nDPR_MSG_SIZE_YUV_MAIN nRESLEN_8 // # of MRF's to hold main YUV block data (8) + #define nDPR_BLOCK_SIZE_YUV_ADDITION nBLOCK_WIDTH_4+nBLOCK_HEIGHT_8 // Additional YUV block size 4x8 + #define nDPR_MSG_SIZE_YUV_ADDITION nRESLEN_1 // # of MRF's to hold additional YUV block data (1) +#endif + +#define udSRC_YUV udBOT_Y_IO +#define nSRC_YUV_REG nBOT_Y + +#define uwDEST_Y uwTOP_Y +#define uwDEST_U uwTOP_U +#define uwDEST_V uwTOP_V + +#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel + +// End of PA_Load.inc diff --git a/i965_drv_video/shaders/post_processing/Common/PA_Load_8x8.asm b/i965_drv_video/shaders/post_processing/Common/PA_Load_8x8.asm new file mode 100644 index 0000000..3569bd1 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PA_Load_8x8.asm @@ -0,0 +1,33 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +// Module name: PA_Load_8x8.asm +//---------------------------------------------------------------- + +#define PA_LOAD_8x8 +#include "PA_Load.inc" + +// Load 16x8 packed data block +// Packed data block should be loaded as 32x8 pixel block + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Block origin + shl (1) rMSGSRC.0<1>:d acc0:w 1:w // H. block origin needs to be doubled (reads acc0 - presumably the X sum left by the add above; confirm) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_YUV:ud // Block width and height (32x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_YUV(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_CURRENT_SRC_YUV:ud + +// Unpack to "planar" YUV422 format in word-aligned bytes + add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub nSRC_YUV_REG*nGRFWIB:w // Initial Y,U,V offset in YUV422 block + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (16) uwDEST_Y(0, %1*16)<1> r[pCF_Y_OFFSET, %1*nGRFWIB]REGION(16,2) + mov (8) uwDEST_U(0, %1*8)<1> r[pCF_U_OFFSET, %1*nGRFWIB]REGION(8,4) + mov (8) uwDEST_V(0, %1*8)<1> r[pCF_V_OFFSET, %1*nGRFWIB]REGION(8,4) + } + +// End of PA_Load_8x8 diff --git a/i965_drv_video/shaders/post_processing/Common/PA_Load_9x8.asm b/i965_drv_video/shaders/post_processing/Common/PA_Load_9x8.asm new file mode 100644 index 0000000..90e56e7 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PA_Load_9x8.asm @@ -0,0 +1,47 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: PA_Load_9x8.asm +//---------------------------------------------------------------- +// This module loads 16x8 Y, 9x8 U and 9x8 V planar data blocks for CSC module +// and stores it in word-aligned format.
+//---------------------------------------------------------------- + +#define PA_LOAD_9x8 +#include "PA_Load.inc" + +// Load 18x8 packed data block +// Packed data block should be loaded as 36x8 pixel block + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Block origin + shl (1) rMSGSRC.0<1>:d acc0:w 1:w // H. block origin needs to be doubled (reads acc0 - presumably the X sum left by the add above; confirm) + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_YUV_MAIN:ud // Block width and height (32x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_YUV(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV_MAIN+nBI_CURRENT_SRC_YUV:ud + + add (1) rMSGSRC.0<1>:d rMSGSRC.0:d 32:w //the last 4 pixels are read again for optimization + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_YUV_ADDITION:ud // Block width and height (4x8) + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_YUV(8)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV_ADDITION+nBI_CURRENT_SRC_YUV:ud + +// Unpack to "planar" YUV422 format in word-aligned bytes + add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub nSRC_YUV_REG*nGRFWIB:w // Initial Y,U,V offset in YUV422 block + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (16) uwDEST_Y(0, %1*16)<1> r[pCF_Y_OFFSET, %1*nGRFWIB]REGION(16,2) + mov (8) uwDEST_U(0, %1*16)<1> r[pCF_U_OFFSET, %1*nGRFWIB]REGION(8,4) + mov (8) uwDEST_V(0, %1*16)<1> r[pCF_V_OFFSET, %1*nGRFWIB]REGION(8,4) + } + + $for(0; <nUV_NUM_OF_ROWS; 1) { + mov (1) uwDEST_U(0, %1*16+8)<1> r[pCF_U_OFFSET, %1*4+256]REGION(1,0) + mov (1) uwDEST_V(0, %1*16+8)<1> r[pCF_V_OFFSET, %1*4+256]REGION(1,0) + } + //UV expansion done in PL9x8_PL16x8.asm module + +// End of PA_Load_9x8 diff --git a/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x4.asm b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x4.asm new file mode 100644 index 0000000..4461c89 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x4.asm @@ -0,0 +1,38 @@ +/* + * All Video Processing kernels + *
Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: PL16x8_PL8x4.asm +//---------------------------------------------------------------- + +#include "common.inc" + +#ifndef DEST_U //DEST_U, DEST_V not defined + + #if (nSRC_REGION==nREGION_1) + #define DEST_Y uwTOP_Y + #define DEST_U uwTOP_U + #define DEST_V uwTOP_V + #elif (nSRC_REGION==nREGION_2) + #define DEST_Y uwBOT_Y + #define DEST_U uwBOT_U + #define DEST_V uwBOT_V + #endif + +#endif + +//Convert 444 from sampler to 8x4 U/V: source index %1 advances by 2 and region <16;8,2> takes every other element. NOTE(review): the original comment said 422, but with nUV_NUM_OF_ROWS redefined to 4 below this looks like 420 - confirm +$for (0, 0; <8; 2, 1) { + mov (8) DEST_U(0,%2*8)<1> DEST_U(%1)<16;8,2> + mov (8) DEST_V(0,%2*8)<1> DEST_V(%1)<16;8,2> +} + +// Re-define new number of lines +#undef nUV_NUM_OF_ROWS +#define nUV_NUM_OF_ROWS 4 diff --git a/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x8.asm b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x8.asm new file mode 100644 index 0000000..fd592db --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL16x8_PL8x8.asm @@ -0,0 +1,36 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +// Module name: PL16x8_PL8x8.asm +//---------------------------------------------------------------- + +#include "common.inc" + +#ifndef DEST_U + + //DEST_U, DEST_V not defined + #if (nSRC_REGION==nREGION_1) + #define DEST_Y uwTOP_Y + #define DEST_U uwTOP_U + #define DEST_V uwTOP_V + #elif (nSRC_REGION==nREGION_2) + #define DEST_Y uwBOT_Y + #define DEST_U uwBOT_U + #define DEST_V uwBOT_V + #endif + +#endif + + +//Convert 444 from sampler to 422. NOTE(review): unlike PL16x8_PL8x4.asm, the movs below carry no explicit execution size - confirm the assembler default is what is intended +$for (0, 0; <8; 2, 1) { + mov DEST_U(%2)<1> DEST_U(%1)<16;8,2> + mov DEST_V(%2)<1> DEST_V(%1)<16;8,2> +} diff --git a/i965_drv_video/shaders/post_processing/Common/PL2_Load.inc b/i965_drv_video/shaders/post_processing/Common/PL2_Load.inc new file mode 100644 index 0000000..9feeba6 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL2_Load.inc @@ -0,0 +1,78 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +// Module name: PL2_Load.inc + +#undef nY_NUM_OF_ROWS +#undef nUV_NUM_OF_ROWS + +#define nY_NUM_OF_ROWS 8 // Number of Y rows per block +#define nDPR_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8 +#define nDPR_MSG_SIZE_Y nRESLEN_4 // # of MRF's to hold Y block data (4) + + +#if defined(NV11_LOAD_4x8) + #define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_8 // U/V block size 8x8 + #define nDPR_MSG_SIZE_UV nRESLEN_2 // # of MRF's to hold U/V block data (2) +#endif + +#if defined(NV11_LOAD_5x8) + #define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_12+nBLOCK_HEIGHT_8 // U/V block size 12x8 + #define nDPR_MSG_SIZE_UV nRESLEN_4 // # of MRF's to hold U/V block data (4) +#endif +#if defined(NV12_LOAD_8x4) + #define nUV_NUM_OF_ROWS 4 // Number of U/V rows per block + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4 // U/V block size 16x4 + #define nDPR_MSG_SIZE_UV nRESLEN_2 // # of MRF's to hold U/V block data (2) +#endif +#if defined(NV12_LOAD_8x5) + #define nUV_NUM_OF_ROWS 6 // Number of U/V rows per block (Rounded Up to Nearest Even Number) + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_5 // U/V block size 16x5 + #define nDPR_MSG_SIZE_UV nRESLEN_3 // # of MRF's to hold U/V block data (3) +#endif +#if defined(NV12_LOAD_9x5) + #define nUV_NUM_OF_ROWS 6 // Number of U/V rows per block (Rounded Up to Nearest Even Number) + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_20+nBLOCK_HEIGHT_5 // U/V block size 20x5 + #define nDPR_MSG_SIZE_UV nRESLEN_5 // # of MRF's to hold U/V block data (5) +#endif +#if defined(P208_LOAD_8x8) + #define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // U/V block size 16x8 + #define nDPR_MSG_SIZE_UV nRESLEN_4 // # of MRF's to hold U/V block data (4) +#endif +#if defined(P208_LOAD_9x8) + #define nUV_NUM_OF_ROWS 8 // Number of U/V
rows per block + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_20+nBLOCK_HEIGHT_8 // U/V block size 20x8 + #define nDPR_MSG_SIZE_UV nRESLEN_8 // # of MRF's to hold U/V block data (8) +#endif + +// Source/destination region definitions +#if !defined(udSRC_Y) + #define udSRC_Y udBOT_Y_IO // Default Y source region is bottom Y region (udBOT_Y_IO) +#endif + +#if !defined(udSRC_U) + #define udSRC_U udBOT_U_IO // Default U source region is bottom U region (udBOT_U_IO) +#endif + +#define ubSRC_Y ubBOT_Y // Loaded data are accessed as bytes from the bottom region +#define nSRC_Y_REG nBOT_Y +#define ubSRC_U ubBOT_U +#define nSRC_U_REG nBOT_U + +#define uwDEST_Y uwTOP_Y // However they can be transferred to word-aligned byte if desired +#define uwDEST_U uwTOP_U +#define uwDEST_V uwTOP_V + +#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel + +// End of PL2_Load.inc diff --git a/i965_drv_video/shaders/post_processing/Common/PL3_Load.inc b/i965_drv_video/shaders/post_processing/Common/PL3_Load.inc new file mode 100644 index 0000000..323df08 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL3_Load.inc @@ -0,0 +1,59 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +// Module name: PL3_Load.inc + +#undef nY_NUM_OF_ROWS +#undef nUV_NUM_OF_ROWS + +#define nY_NUM_OF_ROWS 8 // Number of Y rows per block +#define nDPR_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8 +#define nDPR_MSG_SIZE_Y nRESLEN_4 // # of MRF's to hold Y block data (4) + +#if defined(IMC3_LOAD_8x4) + #define nUV_NUM_OF_ROWS 4 // Number of U/V rows per block + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4 // U/V block size 8x4 + #define nDPR_MSG_SIZE_UV nRESLEN_1 // # of MRF's to hold U/V block data (1) +#endif +#if defined(IMC3_LOAD_8x5) + #define nUV_NUM_OF_ROWS 6 // Number of U/V rows per block (Rounded Up to Nearest Even Number) + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_5 // U/V block size 8x5 + #define nDPR_MSG_SIZE_UV nRESLEN_2 // # of MRF's to hold U/V block data (2) +#endif +#if defined(IMC3_LOAD_9x5) + #define nUV_NUM_OF_ROWS 6 // Number of U/V rows per block (Rounded Up to Nearest Even Number) + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_12+nBLOCK_HEIGHT_5 // U/V block size 12x5 + #define nDPR_MSG_SIZE_UV nRESLEN_3 // # of MRF's to hold U/V block data (3) +#endif + +// Source/destination region definitions +#if !defined(udSRC_Y) + #define udSRC_Y udBOT_Y_IO // Default Y source region is bottom Y region (udBOT_Y_IO) +#endif + +#if !defined(udSRC_U) + #define udSRC_U udBOT_U_IO // Default U source region is bottom U region (udBOT_U_IO) +#endif + +#if !defined(udSRC_V) + #define udSRC_V udBOT_V_IO // Default V source region is bottom V region (udBOT_V_IO) +#endif + +#define ubSRC_Y ubBOT_Y // Loading data are always in byte type +#define ubSRC_U ubBOT_U +#define ubSRC_V ubBOT_V + +#define uwDEST_Y uwTOP_Y // However they can be transferred to word-aligned byte if desired +#define uwDEST_U uwTOP_U +#define uwDEST_V uwTOP_V + +#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel + +// End of PL3_Load.inc diff --git a/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.asm
b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.asm new file mode 100644 index 0000000..653e634 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.asm @@ -0,0 +1,86 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + + +#include "PL4x8_Save_NV11.inc" + + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + +#if !defined(SAVE_UV_ONLY) +// Save current planar frame Y block data (16x8) ------------------------------- + + mov (2) mMSGHDR.0<1>:d wORIX<2;2,1>:w // Block origin + mov (1) mMSGHDR.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8) + +///* Yoni - masking is not relevant for ILK?!? +// +// //Use the mask to determine which pixels shouldn't be over-written +// cmp.ge.f0.0 (1) NULLREG BLOCK_MASK_D:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified +// (f0.0) jmpi WritePlanarToDataPort +// +// //If mask is not all 1's, then load the entire 16x8 block +// //so that only those bytes may be modified that need to be (using the mask) +// send (8) SRC_YD(0)<1> MSGHDR MSGSRC<8;8,1>:ud DWBRMSGDSC+0x00040000+BI_DEST_Y:ud //16x8 +// +// asr (2) MSGSRC.0<1>:ud ORIX<2;2,1>:w 1:w // U/V block origin should be half of Y's +// mov (1) MSGSRC.2<1>:ud 0x00030007:ud // Block width and height (8x4) +// send (8) SRC_UD(0)<1> MSGHDR MSGSRC<8;8,1>:ud DWBRMSGDSC+0x00010000+BI_DEST_U:ud +// send (8) SRC_VD(0)<1> MSGHDR MSGSRC<8;8,1>:ud DWBRMSGDSC+0x00010000+BI_DEST_V:ud +// +// //Restore the origin information +// mov (2) MSGSRC.0<1>:ud ORIX<2;2,1>:w // Block origin +// mov (1) MSGSRC.2<1>:ud 0x0007000F:ud // Block width and height (16x8) +// +// //expand U and V to be aligned on word boundary +// mov (16) SRC_UW(1)<1> SRC_U(0,16) +// mov (16) SRC_UW(0)<1> SRC_U(0, 0) +// mov 
(16) SRC_VW(1)<1> SRC_V(0,16) +// mov (16) SRC_VW(0)<1> SRC_V(0, 0) +// +// //Merge the data +// mov (1) f0.1:uw BLOCK_MASK_V:uw //Load the mask on flag reg +// (f0.1) mov (8) TEMP0<1>:uw BLOCK_MASK_H:uw +// (-f0.1) mov (8) TEMP0<1>:uw 0:uw +// +// // Destination is Word aligned +// $for(0; <Y_ROW_SIZE; 2) { +// mov (1) f0.1:uw TEMP(0,%1)<0;1,0> +// (-f0.1) mov (16) DEST_Y(0, %1*32)<2> SRC_Y(0, %1*16) +// (-f0.1) mov (16) DEST_U(0, %1*8)<1> SRC_U(0, %1*8) //only works for Word aligned Byte data +// (-f0.1) mov (16) DEST_V(0, %1*8)<1> SRC_V(0, %1*8) //only works for Word aligned Byte data +// +// mov (1) f0.1:uw TEMP(0,1+%1)<0;1,0> +// (-f0.1) mov (16) DEST_Y(0, 1+%1*32)<2> SRC_Y(0, 1+%1*16) +// +// } +// +//*/ Yoni - masking is not relevant for ILK?!? + +WritePlanarToDataPort: + $for(0,0; <nY_NUM_OF_ROWS; 2,1) { + mov (16) mubMSGPAYLOAD(%2,0)<1> ub2DEST_Y(%1)REGION(16,2) + mov (16) mubMSGPAYLOAD(%2,16)<1> ub2DEST_Y(%1+1)REGION(16,2) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud +#endif + +// Save U/V data block in planar format (4x8) ---------------------------------- + mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin + asr (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 1:w // U/V block origin should be half of Y's + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud // NOTE(review): this module never writes rMSGSRC.2 (block size); PL8x4_Save_NV12.asm loads nDPW_BLOCK_SIZE_UV into it before this copy - confirm rMSGSRC.2 is preset elsewhere + + $for(0,0; <nY_NUM_OF_ROWS;4,1) { + mov (16) mubMSGPAYLOAD(%2,0)<2> ub2DEST_U(%2)REGION(16,2) + mov (16) mubMSGPAYLOAD(%2,1)<2> ub2DEST_V(%2)REGION(16,2) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud + +// End of PL4x8_Save_NV11 + diff --git a/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.inc b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.inc new file mode 100644 index 0000000..ebd134e --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL4x8_Save_NV11.inc @@ -0,0 +1,60 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel
Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//Module name: PL4x8_Save_NV11.inc +// +// Setup for storing planar data +// + +#include "undefall.inc" //Undefine the SRC and DEST symbols + +#define nDPW_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8 +#define nDPW_MSG_SIZE_Y nMSGLEN_4 // # of MRF's to hold Y block data (4) +#define nDPW_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_8 // U/V interleaved block width and height (8x8) +#define nDPW_MSG_SIZE_UV nMSGLEN_2 // # of MRF's to hold U/V block data (2) + +#if (nSRC_REGION==nREGION_1) + #define udSRC_Y udBOT_Y_IO + #define udSRC_U udBOT_U_IO + #define udSRC_V udBOT_V_IO + #define ubSRC_Y ubBOT_Y + #define ubSRC_U ubBOT_U + #define ubSRC_V ubBOT_V + + #define uwSRC_U uwBOT_U //For masking operation + #define uwSRC_V uwBOT_V + + #define ub2DEST_Y ub2TOP_Y + #define ub2DEST_U ub2TOP_U + #define ub2DEST_V ub2TOP_V + +#elif (nSRC_REGION==nREGION_2) + #define udSRC_Y udTOP_Y_IO + #define udSRC_U udTOP_U_IO + #define udSRC_V udTOP_V_IO + #define ubSRC_Y ubTOP_Y + #define ubSRC_U ubTOP_U + #define ubSRC_V ubTOP_V + + #define uwSRC_U uwTOP_U //For masking operation + #define uwSRC_V uwTOP_V + + #define ub2DEST_Y ub2BOT_Y + #define ub2DEST_U ub2BOT_U + #define ub2DEST_V ub2BOT_V + +#endif + +///* Yoni - masking is not relevant for ILK?!? +//#define TEMP0 REG(r,54) +//.declare TEMP Base=TEMP0 ElementSize=2 SrcRegion=<8;8,1> Type=uw +///* Yoni - masking is not relevant for ILK?!? 
+ + diff --git a/i965_drv_video/shaders/post_processing/Common/PL5x8_PL16x8.asm b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL16x8.asm new file mode 100644 index 0000000..909f8a7 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL16x8.asm @@ -0,0 +1,29 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: PL5x8_PL16x8.asm + +#include "Expansion.inc" + +//------------------------------ Horizontal Upconversion ----------------------------- + $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) { + avg.sat (16) uwDEST_U(0, %1*32+16) uwDEST_U(0, %1*16+7)<1;2,0> uwDEST_U(0, %1*16+7)<1;2,1> + avg.sat (16) uwDEST_V(0, %1*32+16) uwDEST_V(0, %1*16+7)<1;2,0> uwDEST_V(0, %1*16+7)<1;2,1> + avg.sat (16) uwDEST_U(0, %1*32) uwDEST_U(0, %1*16)<1;2,0> uwDEST_U(0, %1*16)<1;2,1> + avg.sat (16) uwDEST_V(0, %1*32) uwDEST_V(0, %1*16)<1;2,0> uwDEST_V(0, %1*16)<1;2,1> + } + $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) { + avg.sat (16) uwDEST_U(0, %1*32+16) uwDEST_U(0, %1*32+18)<1;2,0> uwDEST_U(0, %1*32+18)<1;2,1> + avg.sat (16) uwDEST_V(0, %1*32+16) uwDEST_V(0, %1*32+18)<1;2,0> uwDEST_V(0, %1*32+18)<1;2,1> + avg.sat (16) uwDEST_U(0, %1*32) uwDEST_U(0, %1*32)<1;2,0> uwDEST_U(0, %1*32)<1;2,1> + avg.sat (16) uwDEST_V(0, %1*32) uwDEST_V(0, %1*32)<1;2,0> uwDEST_V(0, %1*32)<1;2,1> + } + +// End of PL5x8_PL16x8 diff --git a/i965_drv_video/shaders/post_processing/Common/PL5x8_PL8x8.asm b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL8x8.asm new file mode 100644 index 0000000..068b2ba --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL5x8_PL8x8.asm @@ -0,0 +1,21 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. 
+ * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: PL5x8_PL8x8.asm + +#include "Expansion.inc" + +//------------------------------ Horizontal Upconversion ----------------------------- + $for (0; <nUV_NUM_OF_ROWS; 1) { + avg.sat (8) uwDEST_U(0, %1*8) uwDEST_U(0, %1*8)<1;2,0> uwDEST_U(0, %1*8)<1;2,1> + avg.sat (8) uwDEST_V(0, %1*8) uwDEST_V(0, %1*8)<1;2,0> uwDEST_V(0, %1*8)<1;2,1> + } + +// End of PL5x8_PL8x8 diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.asm b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.asm new file mode 100644 index 0000000..c286992 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.asm @@ -0,0 +1,88 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: PL8x4_Save_IMC3.asm +// +// Save planar YUV420 frame data block of size 16x8 + +#include "PL8x4_Save_IMC3.inc" + +//Use the mask to determine which pixels shouldn't be over-written + and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud + cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified + (f0.0) jmpi WritePlanarToDataPort + + //If mask is not all 1's, then load the entire 16x8 block + //so that only those bytes may be modified that need to be (using the mask) + + // Load 16x8 planar Y ---------------------------------------------------------- + mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8) + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_Y(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_DESTINATION_Y:ud + // Load 8x4 planar U and V ----------------------------------------------------- + asr (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud // Block width and height (8x4) + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_U(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_U:ud + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + send (8) udSRC_V(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_V:ud + + //expand U and V to be aligned on word boundary - Y remains in bytes + $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) { + mov (16) uwSRC_U(0, %1*16)<1> ubSRC_U(0, %1*16) + mov (16) uwSRC_V(0, %1*16)<1> ubSRC_V(0, %1*16) + } + + //Merge the data + mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg + (f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw + (-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw + + // Destination is Word aligned + $for(0; <nY_NUM_OF_ROWS; 2) { + mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0> + 
(-f0.1) mov (16) ub2DEST_Y(0, %1*32)<2> ubSRC_Y(0, %1*16) + (-f0.1) mov (16) ub2DEST_U(0, %1*8)<1> ubSRC_U(0, %1*8) //only works for Word aligned Byte data + (-f0.1) mov (16) ub2DEST_V(0, %1*8)<1> ubSRC_V(0, %1*8) //only works for Word aligned Byte data + + mov (1) f0.1:uw uwMASK_TEMP(0,1+%1)<0;1,0> + (-f0.1) mov (16) ub2DEST_Y(0, 1+%1*32)<2> ubSRC_Y(0, 1+%1*16) + } + +WritePlanarToDataPort: +#if !defined(SAVE_UV_ONLY) +// Save current planar frame Y block data (16x8) ------------------------------- + mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8) + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + $for(0,0; <nY_NUM_OF_ROWS; 2,1) { + mov(16) mubMSGPAYLOAD(%2,0)<1> ub2DEST_Y(%1)REGION(16,2) + mov(16) mubMSGPAYLOAD(%2,16)<1> ub2DEST_Y(%1+1)REGION(16,2) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud +#endif +// Save U/V data block in planar format (8x4) ---------------------------------- + asr (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud // Block width and height (8x4) + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + +// Save U picture data --------------------------------------------------------- + mov (16) mubMSGPAYLOAD(0,0)<1> ub2DEST_U(0)REGION(16,2) // U rows 0,1 + mov (16) mubMSGPAYLOAD(0,16)<1> ub2DEST_U(1)REGION(16,2) // U rows 2,3 + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_U:ud + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + +// Save V picture data --------------------------------------------------------- + mov (16) mubMSGPAYLOAD(0,0)<1> ub2DEST_V(0)REGION(16,2) // V rows 0,1 + mov (16) mubMSGPAYLOAD(0,16)<1> ub2DEST_V(1)REGION(16,2) // V rows 2,3 + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_V:ud + +// End of PL8x4_Save_IMC3 diff 
--git a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.inc b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.inc new file mode 100644 index 0000000..3b1df17 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_IMC3.inc @@ -0,0 +1,62 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: PL8x4_Save_IMC3.inc +// +// Setup for storing planar data +// + +#include "undefall.inc" //Undefine the SRC and DEST symbols + +// For saving +#define nDPW_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8 +#define nDPW_MSG_SIZE_Y nMSGLEN_4 // # of MRF's to hold Y block data (4) +#define nDPW_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4 // U/V block size 8x4 +#define nDPW_MSG_SIZE_UV nMSGLEN_1 // # of MRF's to hold U/V block data (1) + +// For masking +#undef nDPR_MSG_SIZE_Y +#define nDPR_MSG_SIZE_Y nRESLEN_4 // # of MRF's to hold Y block data (4) +#undef nDPR_MSG_SIZE_UV +#define nDPR_MSG_SIZE_UV nRESLEN_1 // # of MRF's to hold U/V block data (1) +#define rMASK_TEMP REG(r,nTEMP0) +.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF + +#if (nSRC_REGION==nREGION_1) + // For saving + #define ub2DEST_Y ub2TOP_Y + #define ub2DEST_U ub2TOP_U + #define ub2DEST_V ub2TOP_V + //For masking operation + #define udSRC_Y udBOT_Y_IO + #define udSRC_U udBOT_U_IO + #define udSRC_V udBOT_V_IO + #define ubSRC_Y ubBOT_Y + #define ubSRC_U ubBOT_U + #define ubSRC_V ubBOT_V + #define uwSRC_U uwBOT_U + #define uwSRC_V uwBOT_V + +#elif (nSRC_REGION==nREGION_2) + // For saving + #define ub2DEST_Y ub2BOT_Y + #define ub2DEST_U ub2BOT_U + #define ub2DEST_V ub2BOT_V + //For masking operation + #define udSRC_Y udTOP_Y_IO + #define udSRC_U udTOP_U_IO + 
#define udSRC_V udTOP_V_IO + #define ubSRC_Y ubTOP_Y + #define ubSRC_U ubTOP_U + #define ubSRC_V ubTOP_V + #define uwSRC_U uwTOP_U + #define uwSRC_V uwTOP_V + +#endif diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.asm b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.asm new file mode 100644 index 0000000..b54a316 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.asm @@ -0,0 +1,102 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + + +// Module name: PL8x4_Save_NV12.asm +// +// Save entire current planar frame data block of size 16x8 +//--------------------------------------------------------------- +// Symbols needed to be defined before including this module +// +// DWORD_ALIGNED_DEST: only if DEST_Y, DEST_U, DEST_V data are DWord aligned +// ORIX: +//--------------------------------------------------------------- + +#include "PL8x4_Save_NV12.inc" + + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + +#if !defined(SAVE_UV_ONLY) +// Save current planar frame Y block data (16x8) ------------------------------- + + mov (2) mMSGHDR.0<1>:d wORIX<2;2,1>:w // Block origin + mov (1) mMSGHDR.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8) +#endif + +//Use the mask to determine which pixels shouldn't be over-written + and (1) acc0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud + cmp.ge.f0.0 (1) dNULLREG acc0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified + (f0.0) jmpi WritePlanarToDataPort + +//If mask is not all 1's, then load the entire 16x8 block +//so that only those bytes may be modified that need to be (using the mask) + send (8) udSRC_Y(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ 
nDPMR_MSGDSC+nDPR_MSG_SIZE_Y+nBI_DESTINATION_Y:ud //16x8 + + asr (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w 1:w { NoDDClr } // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud { NoDDChk } // Block width and height (16x4) + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud //move message descriptor to the message header + send (8) udSRC_U(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_DESTINATION_UV:ud + +//Restore the origin information + mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // Block origin + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8) + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud //move message descriptor to the message header + +//Merge the data + mov (1) f0.1:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg + (f0.1) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw + (-f0.1) mov (8) rMASK_TEMP<1>:uw 0:uw + +//convert the mask from 16bits to 8bits by selecting every other bit + mov (1) udMASK_TEMP1(0,0)<1> 0x00040001:ud + mov (1) udMASK_TEMP1(0,1)<1> 0x00400010:ud + mov (1) udMASK_TEMP1(0,2)<1> 0x04000100:ud + mov (1) udMASK_TEMP1(0,3)<1> 0x40001000:ud + +//merge the loaded block with the current block + $for(0,0; <nY_NUM_OF_ROWS; 2,1) { + mov (1) f0.1:uw uwMASK_TEMP(0, %1)<0;1,0> + (-f0.1) mov (16) ubDEST_Y(0,%1*32)<2> ubSRC_Y(0,%1*16) + + and.nz.f0.1 (8) wNULLREG uwMASK_TEMP(0,%1)<0;1,0> uwMASK_TEMP1(0,0) //change the mask by selecting every other bit + (-f0.1) mov (8) ubDEST_U(0, %2*16)<2> ub2SRC_U(0, %1*8)<16;8,2> + (-f0.1) mov (8) ubDEST_V(0, %2*16)<2> ub2SRC_U(0, %1*8+1)<16;8,2> + + mov (1) f0.1:uw uwMASK_TEMP(0,1+%1)<0;1,0> + (-f0.1) mov (16) ubDEST_Y(0, (1+%1)*32)<2> ubSRC_Y(0, (1+%1)*16) + + } + +WritePlanarToDataPort: +#if !defined(SAVE_UV_ONLY) + $for(0,0; <nY_NUM_OF_ROWS; 2,1) { + mov (16) mubMSGPAYLOAD(%2,0)<1> ub2DEST_Y(%1)REGION(16,2) + mov (16) mubMSGPAYLOAD(%2,16)<1> ub2DEST_Y(%1+1)REGION(16,2) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE 
nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud +#endif + +//** Save 8x4 packed U and V ----------------------------------------------------- +// we could write directly wORIX to mMSGHDR and then execute asr on it, that way we could +// avoid using rMSGSRC as a buffer and have one command less in code, but it is unknown whether +//it is possible to do asr on mMSGHDR so we use rMSGSRC. + mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin + asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud // U/V block width and height (16x4) + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + + $for(0,0; <nY_NUM_OF_ROWS;4,1) { + mov (16) mubMSGPAYLOAD(%2,0)<2> ub2DEST_U(%2)REGION(16,2) + mov (16) mubMSGPAYLOAD(%2,1)<2> ub2DEST_V(%2)REGION(16,2) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud + +// End of PL8x4_Save_NV12 + diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.inc b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.inc new file mode 100644 index 0000000..879d7e3 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x4_Save_NV12.inc @@ -0,0 +1,85 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//Module name: PL8x4_Save_NV12.inc +// +// Setup for storing planar data +// + +#include "undefall.inc" //Undefine the SRC and DEST symbols +#undef nDPW_BLOCK_SIZE_Y +#undef nDPW_MSG_SIZE_Y +#undef nDPW_BLOCK_SIZE_UV +#undef nDPW_MSG_SIZE_UV + +#define nDPW_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8 +#define nDPW_MSG_SIZE_Y nMSGLEN_4 // # of MRF's to hold Y block data (4) +#define nDPW_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4 // U/V interleaved block width and height (16x4) +#define nDPW_MSG_SIZE_UV nMSGLEN_2 // # of MRF's to hold U/V block data (2) + +// For masking +#undef nDPR_MSG_SIZE_Y +#define nDPR_MSG_SIZE_Y nRESLEN_4 // # of MRF's to hold Y block data (4) +#undef nDPR_MSG_SIZE_UV +#define nDPR_MSG_SIZE_UV nRESLEN_2 +#define rMASK_TEMP REG(r,nTEMP0) +.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF +#define rMASK_TEMP1 REG(r,nTEMP1) +.declare udMASK_TEMP1 Base=rMASK_TEMP1 ElementSize=4 SrcRegion=<4;4,1> Type=ud //1 GRF +.declare uwMASK_TEMP1 Base=rMASK_TEMP1 ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF + + +#if (nSRC_REGION==nREGION_1) + #define udSRC_Y udBOT_Y_IO + #define udSRC_U udBOT_U_IO + #define udSRC_V udBOT_V_IO + #define ubSRC_Y ubBOT_Y + #define ubSRC_U ubBOT_U + #define ubSRC_V ubBOT_V + + #define uwSRC_U uwBOT_U //For masking operation + #define uwSRC_V uwBOT_V + + #define ub2DEST_Y ub2TOP_Y + #define ub2DEST_U ub2TOP_U + #define ub2DEST_V ub2TOP_V + + #define ubDEST_Y ubTOP_Y + #define ubDEST_U ubTOP_U + #define ubDEST_V ubTOP_V + + #define ub2SRC_U ub2BOT_U +#elif (nSRC_REGION==nREGION_2) + #define udSRC_Y udTOP_Y_IO + #define udSRC_U udTOP_U_IO + #define udSRC_V udTOP_V_IO + #define ubSRC_Y ubTOP_Y + #define ubSRC_U ubTOP_U + #define ubSRC_V ubTOP_V + + #define uwSRC_U uwTOP_U //For masking operation + #define uwSRC_V uwTOP_V + + #define ub2DEST_Y ub2BOT_Y + #define ub2DEST_U ub2BOT_U + #define ub2DEST_V ub2BOT_V + + #define ubDEST_Y ubBOT_Y + #define ubDEST_U 
ubBOT_U + #define ubDEST_V ubBOT_V + + #define ub2SRC_U ub2TOP_U +#endif + +///* Yoni - masking is not relevant for ILK?!? +//#define TEMP0 REG(r,54) +//.declare TEMP Base=TEMP0 ElementSize=2 SrcRegion=<8;8,1> Type=uw +///* Yoni - masking is not relevant for ILK?!? + diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x5_PL8x8.asm b/i965_drv_video/shaders/post_processing/Common/PL8x5_PL8x8.asm new file mode 100644 index 0000000..5b98be0 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x5_PL8x8.asm @@ -0,0 +1,27 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: PL8x5_PL8x8.asm + +#include "Expansion.inc" + +//------------------------------- Vertical Upconversion ------------------------------ + avg.sat (8) uwDEST_U(0, 3*16+8)<1> uwDEST_U(0, 3*8) uwDEST_U(0, (1+3)*8) // Optimization + avg.sat (8) uwDEST_V(0, 3*16+8)<1> uwDEST_V(0, 3*8) uwDEST_V(0, (1+3)*8) // Optimization + + $for(nUV_NUM_OF_ROWS/2-2; >-1; -1) { + mov (8) uwDEST_U(0, (1+%1)*16)<1> uwDEST_U(0, (1+%1)*8) + avg.sat (8) uwDEST_U(0, %1*16+8)<1> uwDEST_U(0, %1*8) uwDEST_U(0, (1+%1)*8) + + mov (8) uwDEST_V(0, (1+%1)*16)<1> uwDEST_V(0, (1+%1)*8) + avg.sat (8) uwDEST_V(0, %1*16+8)<1> uwDEST_V(0, %1*8) uwDEST_V(0, (1+%1)*8) + } + +// End of PL8x5_PL8x8 diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.asm b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.asm new file mode 100644 index 0000000..f21d224 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.asm @@ -0,0 +1,30 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. 
+ * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: PL8x8_PL8x4.asm +// +// Convert PL 8x8 to PL8x4 in GRF +//--------------------------------------------------------------- +// Symbols needed to be defined before including this module +// +// DWORD_ALIGNED_DEST: only if DEST_Y, DEST_U, DEST_V data are DWord aligned +// ORIX: +//--------------------------------------------------------------- + +#include "PL8x8_PL8x4.inc" + +// Convert PL8x8 to PL8x4 --------------------------------------------------------- + + mov (8) ubDEST_U(0,16)<2> ubDEST_U(1)<16;8,2> //selecting U every other row + mov (16) ubDEST_U(0,32)<2> ubDEST_U(2)<32;8,2> //selecting U every other row + mov (8) ubDEST_V(0,16)<2> ubDEST_V(1)<16;8,2> //selecting V every other row + mov (16) ubDEST_V(0,32)<2> ubDEST_V(2)<32;8,2> //selecting V every other row + +// End of PL8x8_PL8x4.asm -------------------------------------------------------
\ No newline at end of file diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.inc b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.inc new file mode 100644 index 0000000..bec884e --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_PL8x4.inc @@ -0,0 +1,36 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: PL8x8_PL8x4.inc +// +// Setup module for convert PL8x8 to PL8x4 +// +// + +// Source/destination region definitions +// +#include "undefall.inc" //Undefine the SRC and DEST symbols + +#if (nSRC_REGION==nREGION_1) + + //REGION_1 selected + #define ubDEST_Y ubTOP_Y + #define ubDEST_U ubTOP_U + #define ubDEST_V ubTOP_V + +#elif (nSRC_REGION==nREGION_2) + + //REGION_2 selected + #define ubDEST_Y ubBOT_Y + #define ubDEST_U ubBOT_U + #define ubDEST_V ubBOT_V + + +#endif diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.asm b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.asm new file mode 100644 index 0000000..6b3258f --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.asm @@ -0,0 +1,56 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + + +// Module name: PL8x8_Save_P208.asm +// +// Save entire current planar frame data block of size 16x8 +//--------------------------------------------------------------- +// Symbols needed to be defined before including this module +// +// DWORD_ALIGNED_DEST: only if DEST_Y, DEST_U, DEST_V data are DWord aligned +// ORIX: +//--------------------------------------------------------------- + +#include "PL8x8_Save_P208.inc" + + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + +#if !defined(SAVE_UV_ONLY) +// Save current planar frame Y block data (16x8) ------------------------------- + + mov (2) mMSGHDR.0<1>:d wORIX<2;2,1>:w // Block origin + mov (1) mMSGHDR.2<1>:ud nDPW_BLOCK_SIZE_Y:ud // Block width and height (16x8) + +WritePlanarToDataPort: + $for(0,0; <nY_NUM_OF_ROWS; 2,1) { + mov (16) mubMSGPAYLOAD(%2,0)<1> ub2DEST_Y(%1)REGION(16,2) + mov (16) mubMSGPAYLOAD(%2,16)<1> ub2DEST_Y(%1+1)REGION(16,2) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_Y+nBI_DESTINATION_Y:ud +#endif + +//** Save 8x8 packed U and V ----------------------------------------------------- +// we could write directly wORIX to mMSGHDR and then execute asr on it, that way we could +// avoid using rMSGSRC as a buffer and have one command less in code, but it is unknown whether +//it is possible to do asr on mMSGHDR so we use rMSGSRC. 
+ mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin + + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_UV:ud // U/V block width and height (16x8) + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + + $for(0,0; <nY_NUM_OF_ROWS;2,1) { + mov (16) mubMSGPAYLOAD(%2,0)<2> ub2DEST_U(%2)REGION(16,2) + mov (16) mubMSGPAYLOAD(%2,1)<2> ub2DEST_V(%2)REGION(16,2) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_UV+nBI_DESTINATION_UV:ud + +//End of PL8x8_Save_P208.asm + diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.inc b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.inc new file mode 100644 index 0000000..e3b7d09 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_P208.inc @@ -0,0 +1,61 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + + +//Module name: PL8x8_Save_P208.inc +// +// Setup for storing planar data +// + +#include "undefall.inc" //Undefine the SRC and DEST symbols + +#define nDPW_BLOCK_SIZE_Y nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // Y block size 16x8 +#define nDPW_MSG_SIZE_Y nMSGLEN_4 // # of MRF's to hold Y block data (4) +#define nDPW_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // U/V interleaved block width and height (16x8) +#define nDPW_MSG_SIZE_UV nMSGLEN_4 // # of MRF's to hold U/V block data (4) + +#if (nSRC_REGION==nREGION_1) + #define udSRC_Y udBOT_Y_IO + #define udSRC_U udBOT_U_IO + #define udSRC_V udBOT_V_IO + #define ubSRC_Y ubBOT_Y + #define ubSRC_U ubBOT_U + #define ubSRC_V ubBOT_V + + #define uwSRC_U uwBOT_U //For masking operation + #define uwSRC_V uwBOT_V + + #define ub2DEST_Y ub2TOP_Y + #define ub2DEST_U ub2TOP_U + #define ub2DEST_V ub2TOP_V + +#elif (nSRC_REGION==nREGION_2) + #define udSRC_Y udTOP_Y_IO + #define udSRC_U udTOP_U_IO + #define udSRC_V udTOP_V_IO + #define ubSRC_Y ubTOP_Y + #define ubSRC_U ubTOP_U + #define ubSRC_V ubTOP_V + + #define uwSRC_U uwTOP_U //For masking operation + #define uwSRC_V uwTOP_V + + #define ub2DEST_Y ub2BOT_Y + #define ub2DEST_U ub2BOT_U + #define ub2DEST_V ub2BOT_V + +#endif + +///* Yoni - masking is not relevant for ILK?!? +//#define TEMP0 REG(r,54) +//.declare TEMP Base=TEMP0 ElementSize=2 SrcRegion=<8;8,1> Type=uw +///* Yoni - masking is not relevant for ILK?!? + + diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.asm b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.asm new file mode 100644 index 0000000..d22c76d --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.asm @@ -0,0 +1,71 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. 
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: PL8x8_Save_PA.asm +// +// Save planar YUV422 to packed YUV422 format data +// +// Note: SRC_* must reference to regions with data type "BYTE" +// in order to save to byte-aligned byte location + +#include "PL8x8_Save_PA.inc" + + add (4) pCF_Y_OFFSET<1>:uw ubDEST_CF_OFFSET<4;4,1>:ub nDEST_YUV_REG*nGRFWIB:w // Initial Y,U,V offset in YUV422 block + + // Pack Y + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (16) r[pCF_Y_OFFSET, %1*nGRFWIB]<2> ubSRC_Y(0,%1*32) + } + + // Pack U/V + $for(0; <nUV_NUM_OF_ROWS; 1) { + mov (8) r[pCF_U_OFFSET, %1*nGRFWIB]<4> ubSRC_U(0, %1*16) + mov (8) r[pCF_V_OFFSET, %1*nGRFWIB]<4> ubSRC_V(0, %1*16) + } + + shl (1) rMSGSRC.0<1>:d wORIX<0;1,0>:w 1:w { NoDDClr } // H. block origin need to be doubled + mov (1) rMSGSRC.1<1>:d wORIY<0;1,0>:w { NoDDClr, NoDDChk } // Block origin + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_YUV:ud { NoDDChk } // Block width and height (32x8) + + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + +//Use the mask to determine which pixels shouldn't be over-written + and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud + cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified + (f0.0) jmpi WritePackedToDataPort + + //If mask is not all 1's, then load the entire 32x8 block + //so that only those bytes may be modified that need to be (using the mask) + + // Load 32x8 packed YUV 422 ---------------------------------------------------- + send (8) udSRC_YUV(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_YUV+nBI_DESTINATION_YUV:ud + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + + //Merge the data + mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg + (f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw + (-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw + + // Destination is Byte aligned + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (1) f0.1:uw 
uwMASK_TEMP(0,%1)<0;1,0> + (-f0.1) mov (16) uwDEST_YUV(%1)<1> uwSRC_YUV(%1) //check the UV merge - vK + } + +WritePackedToDataPort: + // Packed YUV data are stored in one of the I/O regions before moving to MRF + // Note: This is necessary since indirect addressing is not supported for MRF. + // Packed data block should be saved as 32x8 pixel block + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udDEST_YUV(%1)REGION(8,1) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_YUV+nBI_DESTINATION_YUV:ud + +// End of PL8x8_Save_PA diff --git a/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.inc b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.inc new file mode 100644 index 0000000..a5cb4a3 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL8x8_Save_PA.inc @@ -0,0 +1,52 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: PL8x8_Save_PA.inc +// +// Setup for storing packed data +// + +#include "undefall.inc" //Undefine the SRC and DEST symbols + +// For saving +#define nDPW_BLOCK_SIZE_YUV nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // YUV block size 32x8 +#define nDPW_MSG_SIZE_YUV nMSGLEN_8 // # of MRF's to hold YUV block data (8) + +// For masking +#undef nDPR_MSG_SIZE_YUV +#define nDPR_MSG_SIZE_YUV nRESLEN_8 // # of MRF's to hold YUV block data (8) +#define rMASK_TEMP REG(r,nTEMP0) +.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF + +#if (nSRC_REGION==nREGION_1) + // For saving + #define udSRC_YUV udTOP_Y_IO + #define udDEST_YUV udBOT_Y_IO + #define nDEST_YUV_REG nBOT_Y + //For masking operation + #define ubSRC_Y ub2TOP_Y + #define ubSRC_U ub2TOP_U + #define ubSRC_V ub2TOP_V + #define uwSRC_YUV uwTOP_Y + #define uwDEST_YUV uwBOT_Y + +#elif (nSRC_REGION==nREGION_2) + // For saving + #define udSRC_YUV udBOT_Y_IO + #define udDEST_YUV udTOP_Y_IO + #define nDEST_YUV_REG nTOP_Y + //For masking operation + #define ubSRC_Y ub2BOT_Y + #define ubSRC_U ub2BOT_U + #define ubSRC_V ub2BOT_V + #define uwSRC_YUV uwBOT_Y + #define uwDEST_YUV uwTOP_Y + +#endif diff --git a/i965_drv_video/shaders/post_processing/Common/PL9x5_PL16x8.asm b/i965_drv_video/shaders/post_processing/Common/PL9x5_PL16x8.asm new file mode 100644 index 0000000..697454f --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL9x5_PL16x8.asm @@ -0,0 +1,37 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: PL9x5_PL16x8.asm + +#define EXPAND_9x5 +#include "Expansion.inc" + +//------------------------------ Horizontal Upconversion ----------------------------- + $for (nUV_NUM_OF_ROWS-2; >-1; -1) { + avg.sat (16) uwDEST_U(0, %1*16)<1> uwDEST_U(0, %1*16)<1;2,0> uwDEST_U(0, %1*16)<1;2,1> + avg.sat (16) uwDEST_V(0, %1*16)<1> uwDEST_V(0, %1*16)<1;2,0> uwDEST_V(0, %1*16)<1;2,1> + } + +#undef nUV_NUM_OF_ROWS +#define nUV_NUM_OF_ROWS 8 //use packed version of all post-processing kernels + +//------------------------------- Vertical Upconversion ------------------------------ + avg.sat (16) uwDEST_U(0, 3*32+16)<1> uwDEST_U(0, 3*16) uwDEST_U(0, (1+3)*16) + avg.sat (16) uwDEST_V(0, 3*32+16)<1> uwDEST_V(0, 3*16) uwDEST_V(0, (1+3)*16) + + $for(nUV_NUM_OF_ROWS/2-2; >-1; -1) { + mov (16) uwDEST_U(0, (1+%1)*32)<1> uwDEST_U(0, (1+%1)*16) + avg.sat (16) uwDEST_U(0, %1*32+16)<1> uwDEST_U(0, %1*16) uwDEST_U(0, (1+%1)*16) + + mov (16) uwDEST_V(0, (1+%1)*32)<1> uwDEST_V(0, (1+%1)*16) + avg.sat (16) uwDEST_V(0, %1*32+16)<1> uwDEST_V(0, %1*16) uwDEST_V(0, (1+%1)*16) + } + +// End of PL9x5_PL16x8 diff --git a/i965_drv_video/shaders/post_processing/Common/PL9x8_PL16x8.asm b/i965_drv_video/shaders/post_processing/Common/PL9x8_PL16x8.asm new file mode 100644 index 0000000..b0fa549 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/PL9x8_PL16x8.asm @@ -0,0 +1,21 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: PL9x8_PL16x8.asm + +#include "Expansion.inc" + +//------------------------------ Horizontal Upconversion ----------------------------- + $for (0; <nUV_NUM_OF_ROWS; 1) { + avg.sat (16) uwDEST_U(0, %1*16)<1> uwDEST_U(0, %1*16)<1;2,0> uwDEST_U(0, %1*16)<1;2,1> + avg.sat (16) uwDEST_V(0, %1*16)<1> uwDEST_V(0, %1*16)<1;2,0> uwDEST_V(0, %1*16)<1;2,1> + } + +// End of PL9x8_PL16x8
\ No newline at end of file diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.asm b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.asm new file mode 100644 index 0000000..7903d63 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.asm @@ -0,0 +1,88 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: RGB16x8_Save_RGB.asm +// +// Save packed ARGB 444 frame data block of size 16x8 +// +// To save 16x8 block (64x8 byte layout for ARGB8888) we need 2 send instructions +// --------- +// | 1 | 2 | +// --------- + +#include "RGB16x8_Save_RGB.inc" + + shl (1) rMSGSRC.0<1>:d wORIX<0;1,0>:w 2:w { NoDDClr } // H. block origin need to be quadrupled + mov (1) rMSGSRC.1<1>:d wORIY<0;1,0>:w { NoDDClr, NoDDChk } // Block origin (1st quadrant) + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_ARGB:ud { NoDDChk } // Block width and height (32x8) + + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + +//Use the mask to determine which pixels shouldn't be over-written + and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud + cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified + (f0.0) jmpi WriteARGBToDataPort + + //If mask is not all 1's, then load the entire 64x8 block + //so that only those bytes may be modified that need to be (using the mask) + + // Load first block 16x8 packed ARGB 444 --------------------------------------- + or (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0xFF00FF00:ud //Check first block + cmp.e.f0.0 (1) dNULLREG acc0.0<0;1,0>:ud 0xFFFFFFFF:ud + (f0.0) jmpi SkipFirstBlockMerge //If full mask then skip this block + + send (8) udSRC_ARGB(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ 
nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + + //Merge the data + mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg + (f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw //use sel instruction - vK + (-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw + + $for(0, 0; <nY_NUM_OF_ROWS; 1, 2) { //take care of the lines in the block, they are different in the src and dest + mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0> + (-f0.1) mov (8) udDEST_ARGB(%2)<1> udSRC_ARGB(%1) + } + +SkipFirstBlockMerge: + // Load second block 16x8 packed ARGB 444 --------------------------------------- + or (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0xFF0000FF:ud //Check second block + cmp.e.f0.0 (1) dNULLREG acc0.0<0;1,0>:ud 0xFFFFFFFF:ud + (f0.0) jmpi WriteARGBToDataPort //If full mask then skip this block + + add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 32:d // Point to 2nd part + send (8) udSRC_ARGB(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud // Point to 1st part again + + //Merge the data + mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg + (f0.0) shr (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw 8:uw //load the mask for second block + (-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw + + $for(0, 1; <nY_NUM_OF_ROWS; 1, 2) { //take care of the lines in the block, they are different in the src and dest + mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0> + (-f0.1) mov (8) udDEST_ARGB(%2)<1> udSRC_ARGB(%1) + } + +WriteARGBToDataPort: + // Move packed data to MRF and output + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*2) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 32:d // Point to 2nd part + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*2+1) + } + send (8) 
dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + +// End of RGB16x8_Save_RGB diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.inc b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.inc new file mode 100644 index 0000000..3dee653 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB.inc @@ -0,0 +1,38 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: RGB16x8_Save_RGB.inc +// + +#include "undefall.inc" //Undefine the SRC and DEST symbols + +// For saving +#define nDPW_BLOCK_SIZE_ARGB nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // ARGB block size 32x8 +#define nDPW_MSG_SIZE_ARGB nMSGLEN_8 // # of MRF's to hold ARGB block data (8) + +// For masking +#undef nDPR_MSG_SIZE_ARGB +#define nDPR_MSG_SIZE_ARGB nRESLEN_8 // # of MRF's to hold ARGB block data (8) +#define rMASK_TEMP REG(r,nTEMP0) +.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF + +#if (nSRC_REGION==nREGION_1) + // For saving + #define udDEST_ARGB udTOP_Y_IO //The output of previous stage is stored here; This is modified and is written to render cache + //For masking operation + #define udSRC_ARGB udBOT_Y_IO //To hold the destination data that shouldn't be modified + +#elif (nSRC_REGION==nREGION_2) + // For saving + #define udDEST_ARGB udBOT_Y_IO //The output of previous stage is stored here; This is modified and is written to render cache + //For masking operation + #define udSRC_ARGB udTOP_Y_IO //To hold the destination data that shouldn't be modified + +#endif diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.asm 
b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.asm new file mode 100644 index 0000000..3fbb9eb --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.asm @@ -0,0 +1,72 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: RGB16x8_Save_RGB16.asm +// +// Save packed RGB565 frame data block of size 16x8 +// +// To save 16x8 block (32x8 byte layout for RGB565) we need 1 send instruction +// ----- +// | 1 | +// ----- + +#include "RGB16x8_Save_RGB16.inc" + +//convert 32 bit RGB to 16 bit RGB + // Truncate A8R8G8B8 to A6R5G6B5 within byte. + // That is keeping 5 MSB of R and B, and 6 MSB of G. + + $for (0, 0; <nY_NUM_OF_ROWS; 1, 2) { + shr uwCSC_TEMP(%1,0)<1> ubDEST_ARGB(%2,0)<32;8,4> 3:w // B >> 3 + + shl (16) uwTEMP_RGB16(0)<1> uwDEST_ARGB(%2,1)<16;8,2> 8:w // R << 8 + and (16) uwTEMP_RGB16(0)<1> uwTEMP_RGB16(0) 0xF800:uw + or (16) uwCSC_TEMP(%1,0)<1> uwCSC_TEMP(%1,0)<16;16,1> uwTEMP_RGB16(0) + + shr (16) uwTEMP_RGB16(0)<1> uwDEST_ARGB(%2,0)<16;8,2> 5:w // G >> 5 + and (16) uwTEMP_RGB16(0)<1> uwTEMP_RGB16(0) 0x07E0:uw + or (16) uwCSC_TEMP(%1,0)<1> uwCSC_TEMP(%1,0)<16;16,1> uwTEMP_RGB16(0) + } + + mov (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w // Block origin (1st quadrant) + shl (1) rMSGSRC.0<1>:d wORIX<0;1,0>:w 1:w // H. 
block origin need to be doubled for byte offset + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_RGB16:ud // Block width and height (32x8) + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + +//Use the mask to determine which pixels shouldn't be over-written + and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud + cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified + (f0.0) jmpi WriteRGB16ToDataPort + + //If mask is not all 1's, then load the entire 32x8 block + //so that only those bytes may be modified that need to be (using the mask) + + // Load 32x8 packed RGB565 ----------------------------------------------------- + send (8) udSRC_RGB16(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_RGB16+nBI_DESTINATION_RGB:ud + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + + //Merge the data + mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg + (f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw //use sel instruction - vK + (-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw + + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0> + (-f0.1) mov (16) uwCSC_TEMP(%1)<1> uwSRC_RGB16(%1) + } + +WriteRGB16ToDataPort: + // Move packed data to MRF and output + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udCSC_TEMP(%1) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_RGB16+nBI_DESTINATION_RGB:ud + +// End of RGB16x8_Save_RGB16 diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.inc b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.inc new file mode 100644 index 0000000..8161432 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_RGB16.inc @@ -0,0 +1,49 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. 
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: RGB16x8_Save_RGB16.inc +// + +#include "undefall.inc" //Undefine the SRC and DEST symbols + +// For saving +#define nDPW_BLOCK_SIZE_RGB16 nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // RGB16 block size 32x8 +#define nDPW_MSG_SIZE_RGB16 nMSGLEN_8 // # of MRF's to hold RGB16 block data (8) + +// For conversion to 16bit +.declare uwTEMP_RGB16 Base=REG(r,nTEMP1) ElementSize=2 SrcRegion=<16;16,1> Type=uw //1 GRF + +// For masking +#undef nDPR_MSG_SIZE_RGB16 +#define nDPR_MSG_SIZE_RGB16 nRESLEN_8 // # of MRF's to hold RGB16 block data (8) +#define rMASK_TEMP REG(r,nTEMP0) +.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF + +#if (nSRC_REGION==nREGION_1) + // For saving + #define ubDEST_ARGB ubTOP_Y //Data from previous module + #define uwDEST_ARGB uwTOP_Y //Data from previous module + #define udCSC_TEMP udBOT_Y_IO //Data Converted to 16 bits + #define uwCSC_TEMP uwBOT_Y + //For masking operation + #define udSRC_RGB16 udTOP_Y_IO //To hold the destination data that shouldn't be modified + #define uwSRC_RGB16 uwTOP_Y + +#elif (nSRC_REGION==nREGION_2) + // For saving + #define ubDEST_ARGB ubBOT_Y //Data from previous module + #define uwDEST_ARGB uwBOT_Y //Data from previous module + #define udCSC_TEMP udTOP_Y_IO //Data Converted to 16 bits + #define uwCSC_TEMP uwTOP_Y + //For masking operation + #define udSRC_RGB16 udBOT_Y_IO //To hold the destination data that shouldn't be modified + #define uwSRC_RGB16 uwBOT_Y + +#endif diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.asm b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.asm new file mode 100644 index 0000000..915f797 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.asm @@ -0,0 +1,107 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. 
+ * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: RGB16x8_Save_Y416.asm +// +// Save packed ARGB 444 frame data block of size 16x8 +// +// To save 16x8 block (128x8 byte layout for ARGB 16bit per component) we need 4 send instructions +// ----------------- +// | 1 | 2 | 3 | 4 | +// ----------------- + +#include "RGB16x8_Save_RGB.inc" + + shl (1) rMSGSRC.0<1>:d wORIX<0;1,0>:w 3:w { NoDDClr } // H. block origin need to become 8 times + mov (1) rMSGSRC.1<1>:d wORIY<0;1,0>:w { NoDDClr, NoDDChk } // Block origin (1st quadrant) + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_ARGB:ud { NoDDChk } // Block width and height (32x8) + + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud +/* Not needed for validation kernels for now -vK +//Use the mask to determine which pixels shouldn't be over-written + and (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0x00FFFFFF:ud + cmp.ge.f0.0(1) dNULLREG acc0.0<0;1,0>:ud 0x00FFFFFF:ud //Check if all pixels in the block need to be modified + (f0.0) jmpi WriteARGBToDataPort + + //If mask is not all 1's, then load the entire 64x8 block + //so that only those bytes may be modified that need to be (using the mask) + + // Load first block 16x8 packed ARGB 444 --------------------------------------- + or (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0xFF00FF00:ud //Check first block + cmp.e.f0.0 (1) dNULLREG acc0.0<0;1,0>:ud 0xFFFFFFFF:ud + (f0.0) jmpi SkipFirstBlockMerge //If full mask then skip this block + + send (8) udSRC_ARGB(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + + //Merge the data + mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg + (f0.0) mov (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw //use sel instruction - vK + (-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw + + $for(0, 0; 
<nY_NUM_OF_ROWS; 1, 2) { //take care of the lines in the block, they are different in the src and dest + mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0> + (-f0.1) mov (8) udDEST_ARGB(%2)<1> udSRC_ARGB(%1) + } + +SkipFirstBlockMerge: + // Load second block 16x8 packed ARGB 444 --------------------------------------- + or (1) acc0.0<1>:ud udBLOCK_MASK<0;1,0>:ud 0xFF0000FF:ud //Check second block + cmp.e.f0.0 (1) dNULLREG acc0.0<0;1,0>:ud 0xFFFFFFFF:ud + (f0.0) jmpi WriteARGBToDataPort //If full mask then skip this block + + add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 32:d // Point to 2nd part + send (8) udSRC_ARGB(0)<1> mMSGHDR udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud // Point to 1st part again + + //Merge the data + mov (1) f0.0:uw ubBLOCK_MASK_V:ub //Load the mask on flag reg + (f0.0) shr (8) rMASK_TEMP<1>:uw uwBLOCK_MASK_H:uw 8:uw //load the mask for second block + (-f0.0) mov (8) rMASK_TEMP<1>:uw 0:uw + + $for(0, 1; <nY_NUM_OF_ROWS; 1, 2) { //take care of the lines in the block, they are different in the src and dest + mov (1) f0.1:uw uwMASK_TEMP(0,%1)<0;1,0> + (-f0.1) mov (8) udDEST_ARGB(%2)<1> udSRC_ARGB(%1) + } +*/ +WriteARGBToDataPort: + // Move packed data to MRF and output + + //Write 1st 4X8 pixels + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*4) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + + //Write 2nd 4X8 pixels + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 32:d // Point to 2nd part + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*4+1) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + + //Write 3rd 4X8 pixels + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 64:d // Point to 3rd part + $for(0; 
<nY_NUM_OF_ROWS; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*4+2) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + + //Write 4th 4X8 pixels + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + add (1) mMSGHDR.0<1>:d rMSGSRC.0<0;1,0>:d 96:d // Point to 4th part + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udDEST_ARGB(%1*4+3) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_ARGB+nBI_DESTINATION_RGB:ud + +// End of RGB16x8_Save_Y416 diff --git a/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.inc b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.inc new file mode 100644 index 0000000..b6b45c4 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/RGB16x8_Save_Y416.inc @@ -0,0 +1,38 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: RGB16x8_Save_Y416.inc +// + +#include "undefall.inc" //Undefine the SRC and DEST symbols + +// For saving +#define nDPW_BLOCK_SIZE_ARGB nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // ARGB block size 32x8 +#define nDPW_MSG_SIZE_ARGB nMSGLEN_8 // # of MRF's to hold ARGB block data (8) + +// For masking +#undef nDPR_MSG_SIZE_ARGB +#define nDPR_MSG_SIZE_ARGB nRESLEN_8 // # of MRF's to hold ARGB block data (8) +#define rMASK_TEMP REG(r,nTEMP0) +.declare uwMASK_TEMP Base=rMASK_TEMP ElementSize=2 SrcRegion=<8;8,1> Type=uw //1 GRF + +#if (nSRC_REGION==nREGION_1) + // For saving + #define udDEST_ARGB udTOP_Y_IO //The output of previous stage is stored here; This is modified and is written to render cache + //For masking operation + #define udSRC_ARGB udBOT_Y_IO //To hold the destination data that shouldn't be modified + +#elif (nSRC_REGION==nREGION_2) + // For saving + #define udDEST_ARGB udBOT_Y_IO //The output of previous stage is stored here; This is modified and is written to render cache + //For masking operation + #define udSRC_ARGB udTOP_Y_IO //To hold the destination data that shouldn't be modified + +#endif diff --git a/i965_drv_video/shaders/post_processing/Common/RGB_Pack.asm b/i965_drv_video/shaders/post_processing/Common/RGB_Pack.asm new file mode 100644 index 0000000..063e256 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/RGB_Pack.asm @@ -0,0 +1,40 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + + +.declare SRC_B Base=REG(r,10) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare SRC_G Base=REG(r,18) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare SRC_R Base=REG(r,26) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare SRC_A Base=REG(r,34) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw + +#define DEST_ARGB ubBOT_ARGB + +#undef nSRC_REGION +#define nSRC_REGION nREGION_2 + + +//Pack directly to mrf as optimization - vK + +$for(0, 0; <8; 1, 2) { +// mov (16) DEST_ARGB(%2,0)<4> SRC_B(%1) { Compr, NoDDClr } // 16 B +// mov (16) DEST_ARGB(%2,1)<4> SRC_G(%1) { Compr, NoDDClr, NoDDChk } // 16 G +// mov (16) DEST_ARGB(%2,2)<4> SRC_R(%1) { Compr, NoDDClr, NoDDChk } // 16 R //these 2 inst can be merged - vK +// mov (16) DEST_ARGB(%2,3)<4> SRC_A(%1) { Compr, NoDDChk } //DEST_RGB_FORMAT<0;1,0>:ub { Compr, NoDDChk } // 16 A + + mov (8) DEST_ARGB(%2, 0)<4> SRC_B(%1) { NoDDClr } // 8 B + mov (8) DEST_ARGB(%2, 1)<4> SRC_G(%1) { NoDDClr, NoDDChk } // 8 G + mov (8) DEST_ARGB(%2, 2)<4> SRC_R(%1) { NoDDClr, NoDDChk } // 8 R + mov (8) DEST_ARGB(%2, 3)<4> SRC_A(%1) { NoDDChk } // 8 A + + mov (8) DEST_ARGB(%2+1,0)<4> SRC_B(%1,8) { NoDDClr } // 8 B + mov (8) DEST_ARGB(%2+1,1)<4> SRC_G(%1,8) { NoDDClr, NoDDChk } // 8 G + mov (8) DEST_ARGB(%2+1,2)<4> SRC_R(%1,8) { NoDDClr, NoDDChk } // 8 R + mov (8) DEST_ARGB(%2+1,3)<4> SRC_A(%1,8) { NoDDChk } // 8 A +} diff --git a/i965_drv_video/shaders/post_processing/Common/SetupVPKernel.asm b/i965_drv_video/shaders/post_processing/Common/SetupVPKernel.asm new file mode 100644 index 0000000..6375b0c --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/SetupVPKernel.asm @@ -0,0 +1,34 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. 
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: SetupVPKernel.asm +// +// Initial setup for running video-processing kernels +// + +#include "common.inc" + +// +// Now, begin source code.... +// +.code + +#include "Init_All_Regs.asm" + +mov (8) rMSGSRC.0<1>:ud r0.0<8;8,1>:ud // Initialize message payload header with R0 +#if defined (INC_BLENDING) + mul (1) fALPHA_STEP_X:f fSCALING_STEP_RATIO:f fVIDEO_STEP_X:f //StepX_ratio = AlphaStepX / VideoStepX +#endif + +// End of SetupVPKernel + + + + diff --git a/i965_drv_video/shaders/post_processing/Common/common.inc b/i965_drv_video/shaders/post_processing/Common/common.inc new file mode 100644 index 0000000..a0a66a0 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/common.inc @@ -0,0 +1,610 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +#ifndef COMMON_INC +#define COMMON_INC + +// Module name: common.inc +// +// Common header file for all Video-Processing kernels +// + +.default_execution_size (16) +.default_register_type :ub + +.reg_count_total 80 +.reg_count_payload 4 + + +//========== Common constants ========== + +// Bit position constants +#define BIT0 0x01 +#define BIT1 0x02 +#define BIT2 0x04 +#define BIT3 0x08 +#define BIT4 0x10 +#define BIT5 0x20 +#define BIT6 0x40 +#define BIT7 0x80 +#define BIT8 0x0100 +#define BIT9 0x0200 +#define BIT10 0x0400 +#define BIT11 0x0800 +#define BIT12 0x1000 +#define BIT13 0x2000 +#define BIT14 0x4000 +#define BIT15 0x8000 +#define BIT16 0x00010000 +#define BIT17 0x00020000 +#define BIT18 0x00040000 +#define BIT19 0x00080000 +#define BIT20 0x00100000 +#define BIT21 0x00200000 +#define BIT22 0x00400000 +#define BIT23 0x00800000 +#define BIT24 0x01000000 +#define BIT25 0x02000000 +#define BIT26 0x04000000 +#define BIT27 0x08000000 +#define BIT28 0x10000000 +#define BIT29 0x20000000 +#define BIT30 0x40000000 +#define BIT31 0x80000000 + +#define nGRFWIB 32 // GRF register width in byte +#define nGRFWIW 16 // GRF register width in word +#define nGRFWID 8 // GRF register width in dword + +#define nTOP_FIELD 0 +#define nBOTTOM_FIELD 1 + +#define nPREVIOUS_FRAME 0 // Previous frame +#define nCURRENT_FRAME 1 // Current frame +#define nNEXT_FRAME 2 // Next frame + +#ifdef GT +// GT DI Kernel +#else // ILK +// ILK DI Kernel +#endif + +//=================================== + +//========== Macros ========== +#define REGION(Width,HStride) <Width*HStride;Width,HStride> // Region definition when ExecSize = Width + +#define RegFile(a) a +#define REG(r,n) _REG(RegFile(r),n) +#define _REG(r,n) __REG(r,n) +#define __REG(r,n) r##n.0 +#define REG2(r,n,s) _REG2(RegFile(r),n,s) +#define _REG2(r,n,s) __REG2(r,n,s) +#define __REG2(r,n,s) r##n.##s + +#define dNULLREG null<1>:d +#define wNULLREG null<1>:w + +#define KERNEL_ID(kernel_ID) mov NULLREG kernel_ID:ud + + 
+#define NODDCLR +#define NODDCLR_NODDCHK +#define NODDCHK + +//#define NODDCLR { NoDDClr } +//#define NODDCLR_NODDCHK { NoDDClr, NoDDChk } +//#define NODDCHK { NoDDChk } + + +//========== Defines ==================== + + +//========== GRF partition ========== +// r0 header : r0 (1 GRF) +// Static parameters : r1 - r5 (5 GRFS) +// Inline parameters : r6 - r7 (2 GRFs) +// MSGSRC : r9 (1 GRF) +// Top IO region : r10 - r33 (24 GRFS 8 for each component Y,U,V 16X8:w) +// Free space : r34 - r55 (22 GRFS) +// Bottom IO region : r56 - r79 (24 GRFS 8 for each component Y,U,V 16X8:w) +//=================================== + + +//========== Static Parameters ========== +// r1 +#define fPROCAMP_C0 r1.0 // DWORD 0, Procamp constant C0 in :f +#define wPROCAMP_C0 r1.0 // DWORD 0, Procamp constant C0 in :w +#define NUMBER_0002 r1.1 // DWORD 0, 0x0002 used in procamp for GT +#define udCP_MessageFormat r1.0 // DWORD 0, bits 2:3 of DWORD. (CE) +#define udCP_StatePointer r1.0 // DWORD 0, bits 31:5 of DWORD.(CE) + +#define ubSRC_CF_OFFSET r1.4 // DWORD 1, byte 0-2. SRC packed color format YUV offset in :ub + +#define ubDEST_RGB_FORMAT r1.8 // DWORD 2, byte 0. Dest RGB color format (0:ARGB FF:XRGB) +#define ubDEST_CF_OFFSET r1.8 // DWORD 2, byte 0-2. Dest packed color format YUV offset in :ub + +#define fPROCAMP_C1 r1.3 // DWORD 3, Procamp constant C1 in :f +#define wPROCAMP_C1 r1.6 // DWORD 3, Procamp constant C1 in :w +#define NUMBER_0100 r1.7 // DWORD 3, 0x0100 used in procamp for GT + +#define fPROCAMP_C2 r1.4 // DWORD 4, Procamp constant C2 in :f +#define wPROCAMP_C2 r1.8 // DWORD 4, Procamp constant C2 in :w + +#define uwSPITCH_DIV2 r1.10 // DWORD 5, byte 0-1. statistics surface pitch divided by 2 + +#define fVIDEO_STEP_Y r1.6 // DWORD 6, :f, AVS normalized reciprocal of Y Scaling factor +#define ubSTMM_SHIFT r1.24 // DWORD 6, byte 0. Amount of right shift for the DI blending equation +#define ubSTMM_MIN r1.25 // DWORD 6, byte 1. 
Min STMM for DI blending equation +#define ubSTMM_MAX r1.26 // DWORD 6, byte 2. Max STMM for DI blending equation +#define ubTFLD_FIRST r1.27 // DWORD 6, byte 3. Field parity order + +#define fPROCAMP_C5 r1.7 // DWORD 7, Procamp constant C3 in :f +#define wPROCAMP_C5 r1.14 // DWORD 7, Procamp constant C3 in :w + +// r2 +#define fPROCAMP_C3 r2.0 // DWORD 0, Procamp constant C4 in :f +#define wPROCAMP_C3 r2.0 // DWORD 0, Procamp constant C4 in :w + +#define fCSC_C5 r2.2 // DWORD 2. WG+CSC constant C5 +#define wCSC_C5 r2.4 // DWORD 2. WG+CSC constant C5 + +#define fPROCAMP_C4 r2.3 // DWORD 3, Procamp constant C5 in :f +#define wPROCAMP_C4 r2.6 // DWORD 3, Procamp constant C5 in :w + +#define fCSC_C8 r2.4 // DWORD 4. WG+CSC constant C8 +#define wCSC_C8 r2.8 // DWORD 4. WG+CSC constant C8 +#define fCSC_C9 r2.7 // DWORD 7. WG+CSC constant C9 +#define wCSC_C9 r2.14 // DWORD 7. WG+CSC constant C9 + +// r3 +#define fCSC_C0 r3.0 // DWORD 0. WG+CSC constant C0 +#define wCSC_C0 r3.0 // DWORD 0. WG+CSC constant C0 + +#define fSCALING_STEP_RATIO r3.1 // DWORD 1, = Alpha_X_Scaling_Step / Video_X_scaling_Step :f (blending) +#define fALPHA_STEP_X r3.1 // DWORD 1, = 1/Scale X, 0.5 = 2x, in :f (blending) + +#define fALPHA_STEP_Y r3.2 // DWORD 2, = 1/Scale Y, in :f + +#define fCSC_C4 r3.3 // DWORD 3. WG+CSC constant C4 +#define wCSC_C4 r3.6 // DWORD 3. WG+CSC constant C4 +#define fCSC_C1 r3.4 // DWORD 4. WG+CSC constant C1 +#define wCSC_C1 r3.8 // DWORD 4. WG+CSC constant C1 + +#define wSRC_H_ORI_OFFSET r3.10 // DWORD 5, bytes 0,1 :w +#define wSRC_V_ORI_OFFSET r3.11 // DWORD 5, bytes 2,3 :w + +#define dCOLOR_PIXEL r3.6 // DWORD 6. Color pixel for Colorfill + +#define fCSC_C2 r3.6 // DWORD 6. WG+CSC constant C2 +#define wCSC_C2 r3.12 // DWORD 6. WG+CSC constant C2 +#define fCSC_C3 r3.7 // DWORD 7. WG+CSC constant C3 +#define wCSC_C3 r3.14 // DWORD 7. WG+CSC constant C3 + +// r4 +#define fCSC_C6 r4.0 // DWORD 0. WG+CSC constant C6 +#define wCSC_C6 r4.0 // DWORD 0. 
WG+CSC constant C6 + +#define wFRAME_ENDX r4.2 // DWORD 1, word 0. Horizontal end = Origin+Width (in pixels)(for multiple blocks) +#define wNUM_BLKS r4.3 // DWORD 1, word 1. Number of blocks to process (for multiple blocks) + +#define wCOPY_ORIX r4.5 // DWORD 2, word 1. A copy of X origin (for multiple blocks) +#define uwNLAS_ENABLE r4.4 // DWORD 2, bit 15, NLAS enable bit + +#define fCSC_C7 r4.3 // DWORD 3. WG+CSC constant C7 +#define wCSC_C7 r4.6 // DWORD 3. WG+CSC constant C7 +#define fCSC_C10 r4.4 // DWORD 4. WG+CSC constant C10 +#define wCSC_C10 r4.8 // DWORD 4. WG+CSC constant C10 + +#define fFRAME_VID_ORIX r4.5 // DWORD 5, Frame horizontal origin normalized for scale kernel + +#define fFRAME_ALPHA_ORIX r4.6 // DWORD 6. Normalized alpha horiz origin for the frame + +#define fCSC_C11 r4.7 // DWORD 7. WG+CSC constant C11 +#define wCSC_C11 r4.14 // DWORD 7. WG+CSC constant C11 + +//======================================== + +//========== Inline parameters =========== +// r5 +#define wORIX r5.0 // DWORD 0, byte 0-1. :w, Destination Block Horizontal Origin in pel +#define wORIY r5.1 // DWORD 0, byte 2-3. :w, Destination Block Vertical Origin in pel + +#define fSRC_VID_H_ORI r5.1 // DWORD 1, :f, SRC Y horizontal origin normalized for scale kernel + +#define fSRC_VID_V_ORI r5.2 // DWORD 2, :f, SRC Y vertical origin normalized for scale kernel + +#define fSRC_ALPHA_H_ORI r5.3 // DWORD 3, :f, Normalized alpha horizontal origin + +#define fSRC_ALPHA_V_ORI r5.4 // DWORD 4, :f, Normalized alpha vertical origin + +#define uwALPHA_MASK_X r5.10 // DWORD 5, byte 0-1 :w, H. alpha mask +#define ubALPHA_MASK_Y r5.22 // DWORD 5, byte 2. :ub,V. 
alpha mask +#define ubBLK_CNT_X r5.23 // DWORD 5, byte 3, :ub, Horizontal Block Count per thread + +#define udBLOCK_MASK r5.6 // DWORD 6 +#define uwBLOCK_MASK_H r5.12 // DWORD 6, byte 0-1 :uw, Block horizontal mask used in non-DWord aligned kernels +#define ubBLOCK_MASK_V r5.26 // DWORD 6, byte 2 :ub, Block vertical mask used in non-DWord aligned kernels +#define ubNUM_BLKS r5.27 // DWORD 6, byte 3, :ub, Total Block Count per thread + +#define fVIDEO_STEP_X r5.7 // DWORD 7. :f, AVS normalized reciprocal of X Scaling factor + +// r6 +#define fVIDEO_STEP_DELTA r6.0 // DWORD 0. :f, AVS normalized delta between 2 adjacent scaling steps (used for non-linear scaling) + + +//====================== Binding table ========================================= + +#if defined(DNDI) + // DNDI Surface Binding Table + //#define nBI_SRC_CURR 0 // Current input frame surface + //#define nBI_SRC_PRIV 1 // Denoised previous input frame surface + //#define nBI_SRC_STAT 2 // Statistics input surface (STMM / Noise motion history) + //#define nBI_DEST_1ST 3 // 1st deinterlaced output frame surface +// #define nBI_DEST_YUV 3 // Dest frame YUV (for DN only) + //#define nBI_DEST_Y 3 // Dest frame Y (for DN only) + //#define nBI_DEST_2ND 4 // 2nd deinterlaced output frame surface + //#define nBI_DEST_DN_CURR 6 // Denoised current output frame surface + //#define nBI_DEST_STAT 7 // Statistics output surface (STMM / Noise motion history) +// #define nBI_DEST_U 8 // Dest frame U (for DN only) +// #define nBI_DEST_V 9 // Dest frame V (for DN only) +// #define nBI_SRC_U 10 // Src frame U (for DN only) +// #define nBI_SRC_V 11 // Src frame V (for DN only) +// #define nBI_SRC_UV 10 // Current src frame for UV + +#endif + +#if defined(INPUT_PL3) + // PL3 Surface Binding Table +// #define nBI_SRC_ALPHA 0 // Alpha +// #define nBI_SRC_Y 1 // Current src frame +// #define nBI_SRC_U 2 // Current src frame +// #define nBI_SRC_V 3 // Current src frame +// #define nBI_DEST_Y 10 // Dest frame +// #define 
nBI_DEST_U 11 // Dest frame +// #define nBI_DEST_V 12 // Dest frame +// #define nBI_DEST_YUV 7 // Dest frame +// #define nBI_DEST_RGB 7 // same num as BI_DEST_YUV, never used at the same time +#endif + +#if defined(INPUT_PL2) + // PL2 Surface Binding Table +// #define nBI_SRC_ALPHA 0 // Alpha +// #define nBI_SRC_Y 1 // Current src frame for Y + offset UV +// #define nBI_SRC_YUV 1 // Current src frame for YUV in case of NV12_AVS +// #define nBI_SRC_UV 2 // Current src frame for UV +// #define nBI_DEST_YUV 7 // Current dest frame for Y + offset UV +// #define nBI_DEST_RGB 7 // same num as BI_DEST_YUV, never used at the same time +// #define nBI_DEST_Y 10 // Dest frame +// #define nBI_DEST_U 11 // Dest frame +// #define nBI_DEST_V 12 // Dest frame +#endif + +#if defined(INPUT_PA) || defined(COLORFILL) + // Packed Surface Binding Table +// #define nBI_SRC_ALPHA 0 // Alpha +// #define nBI_SRC_YUV 1 // Current src frame +// #define nBI_DEST_YUV 3 // Dest frame +// #define nBI_DEST_RGB 3 // same num as BI_DEST_YUV, never used at the same time +#endif + + +//super binding table +#define nBI_ALPHA_SRC 0 +#define nBI_CURRENT_SRC_YUV 1 +#define nBI_FIELD_COPY_SRC_1_YUV 1 +#define nBI_CURRENT_SRC_Y 1 +#define nBI_FIELD_COPY_SRC_1_Y 1 +#define nBI_CURRENT_SRC_RGB 1 +#define nBI_CURRENT_SRC_UV 2 +#define nBI_FIELD_COPY_SRC_1_UV 2 +#define nBI_CURRENT_SRC_U 2 +#define nBI_FIELD_COPY_SRC_1_U 2 +#define nBI_CURRENT_SRC_V 3 +#define nBI_FIELD_COPY_SRC_1_V 3 +#define nBI_TEMPORAL_REFERENCE_YUV 4 +#define nBI_FIELD_COPY_SRC_2_YUV 4 +#define nBI_TEMPORAL_REFERENCE_Y 4 +#define nBI_FIELD_COPY_SRC_2_Y 4 +#define nBI_CURRENT_SRC_YUV_HW_DI 4 +#define nBI_TEMPORAL_REFERENCE_UV 5 +#define nBI_FIELD_COPY_SRC_2_UV 5 +#define nBI_TEMPORAL_REFERENCE_U 5 +#define nBI_FIELD_COPY_SRC_2_U 5 +#define nBI_DENOISED_PREV_HW_DI 5 +#define nBI_TEMPORAL_REFERENCE_V 6 +#define nBI_FIELD_COPY_SRC_2_V 6 +#define nBI_STMM_HISTORY 6 +#define nBI_DESTINATION_YUV 7 +#define nBI_DESTINATION_RGB 7 +#define 
nBI_DESTINATION_Y 7 +#define nBI_DESTINATION_UV 8 +#define nBI_DESTINATION_U 8 +#define nBI_DESTINATION_V 9 +#define nBI_DESTINATION_1_YUV 10 +#define nBI_DESTINATION_1_Y 10 +#define nBI_DESTINATION_1_UV 11 +#define nBI_DESTINATION_1_U 11 +#define nBI_DESTINATION_1_V 12 +#define nBI_DESTINATION_2_YUV 13 +#define nBI_DESTINATION_2_Y 13 +#define nBI_DESTINATION_2_UV 14 +#define nBI_DESTINATION_2_U 14 +#define nBI_DESTINATION_2_V 15 +#define nBI_STMM_HISTORY_OUTPUT 20 +#define nBI_TEMPORAL_REFERENCE_YUV_PDI 21 +#define nBI_TEMPORAL_REFERENCE_Y_PDI 21 +#define nBI_TEMPORAL_REFERENCE_UV_PDI 22 +#define nBI_TEMPORAL_REFERENCE_U_PDI 22 +#define nBI_TEMPORAL_REFERENCE_V_PDI 23 +#define nBI_SUBVIDEO_YUV 26 +#define nBI_SUBVIDEO_Y 26 +#define nBI_SUBVIDEO_UV 27 +#define nBI_SUBVIDEO_U 27 +#define nBI_SUBVIDEO_V 28 +#define nBI_SUBPICTURE_YUV 29 +#define nBI_SUBPICTURE_P8 29 +#define nBI_SUBPICTURE_A8 30 +#define nBI_GRAPHIC_YUV 31 +#define nBI_GRAPHIC_P8 31 +#define nBI_GRAPHIC_A8 32 + + + +//========== Planar Sampler State Table Index ========== +#define nSI_SRC_ALPHA 0x000 // Sampler State for Alpha + +//Sampler Index for AVS/IEF messages +#define nSI_SRC_Y 0x400 // Sampler State for Y +#define nSI_SRC_U 0x800 // Sampler State for U +#define nSI_SRC_V 0xC00 // Sampler State for V +#define nSI_SRC_UV 0x800 // For NV12 surfaces +#define nSI_SRC_YUV 0x400 // For Packed surfaces +#define nSI_SRC_RGB 0x400 // For ARGB surfaces + +//Sampler Index for SIMD16 sampler messages +#define nSI_SRC_SIMD16_Y 0x100 // Sampler State for Y +#define nSI_SRC_SIMD16_U 0x200 // Sampler State for U +#define nSI_SRC_SIMD16_V 0x300 // Sampler State for V +#define nSI_SRC_SIMD16_UV 0x200 // For NV12 surfaces +#define nSI_SRC_SIMD16_YUV 0x100 // For Packed surfaces +#define nSI_SRC_SIMD16_RGB 0x100 // For ARGB surfaces + + + +// Common Registers +#define pCF_Y_OFFSET a0.4 // Address register holding Y offset +#define pCF_U_OFFSET a0.5 // Address register holding U offset +#define pCF_V_OFFSET a0.6 
// Address register holding V offset + +// #define YUV_ORI ORIX // Used by writing packed data to dport + + +//================= Message Payload Header fields ============================== +#define IDP r0.2:ud // Interface Descriptor Pointer + +//================= Common Message Descriptor TBD add common load and save ===== +// Message descriptor for dataport media write +#ifdef GT + // Message Descriptors + // = 000 0001 (min message len 1 - add later) 00000 (resp len 0) + // 1 (header present 1) 0 0 1010 (media block write) 00000 + // 00000000 (binding table index - set later) + // = 0x02094000 + #define nDPMW_MSGDSC 0x02094000 + #define nDPMR_MSGDSC 0x02098000 // Data Port Media Block Read Message Descriptor + // TBD +#else // ILK + // Message Descriptors + // = 000 0001 (min message len 1 - add later) 00000 (resp len 0) + // 1 (header present 1) 000 0 010 (media block write) 0000 + // 00000000 (binding table index - set later) + // = 0x02082000 + #define nDPMW_MSGDSC 0x02082000 // Data Port Media Block Write Message Descriptor + #define nDPMR_MSGDSC 0x0208A000 // Data Port Media Block Read Message Descriptor +#endif + +// Message Length defines +#define nMSGLEN_1 0x02000000 // Message Length of 1 GRF for Send +#define nMSGLEN_2 0x04000000 // Message Length of 2 GRF for Send +#define nMSGLEN_4 0x08000000 // Message Length of 4 GRF for Send +#define nMSGLEN_8 0x10000000 // Message Length of 8 GRF for Send + +// Response Length defines +#define nRESLEN_1 0x00100000 // Message Response Length of 1 GRF from Send +#define nRESLEN_2 0x00200000 // Message Response Length of 2 GRF from Send +#define nRESLEN_3 0x00300000 // Message Response Length of 3 GRF from Send +#define nRESLEN_4 0x00400000 // Message Response Length of 4 GRF from Send +#define nRESLEN_5 0x00500000 // Message Response Length of 5 GRF from Send +#define nRESLEN_8 0x00800000 // Message Response Length of 8 GRF from Send +#define nRESLEN_9 0x00900000 // Message Response Length of 9 GRF from Send 
+#define nRESLEN_11 0x00B00000 // Message Response Length of 11 GRF from Send +#define nRESLEN_12 0x00C00000 // Message Response Length of 12 GRF from Send +#define nRESLEN_16 0x01000000 // Message Response Length of 16 GRF from Send + +// Block Width and Height Size defines +#define nBLOCK_WIDTH_4 0x00000003 // Block Width 4 +#define nBLOCK_WIDTH_5 0x00000004 // Block Width 5 +#define nBLOCK_WIDTH_8 0x00000007 // Block Width 8 +#define nBLOCK_WIDTH_9 0x00000008 // Block Width 9 +#define nBLOCK_WIDTH_12 0x0000000B // Block Width 12 +#define nBLOCK_WIDTH_16 0x0000000F // Block Width 16 +#define nBLOCK_WIDTH_20 0x00000013 // Block Width 20 +#define nBLOCK_WIDTH_32 0x0000001F // Block Width 32 +#define nBLOCK_HEIGHT_1 0x00000000 // Block Height 1 +#define nBLOCK_HEIGHT_2 0x00010000 // Block Height 2 +#define nBLOCK_HEIGHT_4 0x00030000 // Block Height 4 +#define nBLOCK_HEIGHT_5 0x00040000 // Block Height 5 +#define nBLOCK_HEIGHT_8 0x00070000 // Block Height 8 + +// Extended Message Descriptors +#define nEXTENDED_MATH 0x1 +#define nSMPL_ENGINE 0x2 +#define nMESSAGE_GATEWAY 0x3 +#define nDATAPORT_READ 0x4 +#define nDATAPORT_WRITE 0x5 +#define nURB 0x6 +#define nTS_EOT 0x27 // with End-Of-Thread bit ON + +// Common message descriptors: +#ifdef GT + #define nEOT_MSGDSC 0x02000010 // End of Thread Message Descriptor + #define IF_NULL null:uw null:uw null:uw //for different if instructions on ILK and Gen6 +#else //ILK + #define nEOT_MSGDSC 0x02000000 // End of Thread Message Descriptor + #define IF_NULL +#endif + + +//===================== Math Function Control =================================== +#define mfcINV 0x1 // reciprocal +#define mfcLOG 0x2 // log +#define mfcEXP 0x3 // exponent +#define mfcSQRT 0x4 // square root +#define mfcRSQ 0x5 // reciprocal square root +#define mfcSIN 0x6 // sine (in radians) +#define mfcCOS 0x7 // cosine (in radians) +#define mfcSINCOS 0x8 // dst0 = sin of src0, dst1 = cosine of src0 (in radians) - GT+ ONLY +#define mfcPOW 0xA // abs(src0) 
raised to the src1 power +#define mfcINT_DIV_QR 0xB // return quotient and remainder +#define mfcINT_DIV_Q 0xC // return quotient +#define mfcINT_DIV_R 0xD // return remainder + + +//=================== Message related registers ================================= + +#ifdef GT + #define udDUMMY_NULL +#else // _ILK + #define udDUMMY_NULL null:ud // Used in send inst as src0 +#endif + + +//----------- Message Registers ------------ +#define mMSGHDR m1 // Message Payload Header +#define mMSGHDRY m1 // Message Payload Header register for Y data +#define mMSGHDRU m2 // Message Payload Header register for U data +#define mMSGHDRV m3 // Message Payload Header register for V data +#define mMSGHDRYA m4 // Second Message Payload Header register for Y data +#define mMSGHDRH m5 // Message Payload Header register for motion history +#define mMSGHDRY1 m1 // Message Payload Header register for first Y data +#define mMSGHDRY2 m2 // Message Payload Header register for second Y data +#define mMSGHDRY3 m3 // Message Payload Header register for third Y data +#define mMSGHDRY4 m4 // Message Payload Header register for fourth Y data +#define mMSGHDRY5 m5 // Message Payload Header register for fifth Y data +#define mMSGHDRY6 m6 // Message Payload Header register for sixth Y data +#define mMSGHDR_EOT m15 // Dummy Message Register for EOT + +#define rMSGSRC r8 // Message source register +#define pMSGDSC a0.0:ud // Message Descriptor register (type DWORD) + +#define udMH_ORI rMSGSRC.0 // Data Port Media Block R/W message header block offset +#define udMH_ORIX rMSGSRC.0 // Data Port Media Block R/W message header X offset +#define udMH_ORIY rMSGSRC.1 // Data Port Media Block R/W message header Y offset +#define udMH_SIZE rMSGSRC.2 // Data Port Media Block R/W message header block width & height + +// M2 - M9 for message data payload +.declare mubMSGPAYLOAD Base=m2 ElementSize=1 SrcRegion=REGION(16,1) Type=ub +.declare muwMSGPAYLOAD Base=m2 ElementSize=2 SrcRegion=REGION(16,1) Type=uw +.declare 
mudMSGPAYLOAD Base=m2 ElementSize=4 SrcRegion=REGION(8,1) Type=ud +.declare mfMSGPAYLOAD Base=m2 ElementSize=4 SrcRegion=REGION(8,1) Type=f + +//=================== End of thread instruction =========================== +#ifdef GT + #define END_THREAD mov (8) mMSGHDR_EOT<1>:ud r0.0<8;8,1>:ud \n\ + send (1) null<1>:d mMSGHDR_EOT nTS_EOT nEOT_MSGDSC +#else // ILK This should be changed to 1 instruction; I have tested it and it works - vK + #define END_THREAD mov (8) mMSGHDR_EOT<1>:ud r0.0<8;8,1>:ud \n\ + send (1) dNULLREG mMSGHDR_EOT udDUMMY_NULL nTS_EOT nEOT_MSGDSC:ud +#endif + + +//======================================================================= +// Region declarations for SRC and DEST as TOP and BOT + +// Common I/O regions +#define nREGION_1 1 +#define nREGION_2 2 + +//*** These region base GRFs are fixed regardless planar/packed, and data alignment. +//*** Each kernel is responsible to select the correct region declaration below. +//*** YUV regions are not necessarily next to each other. 
+#define nTOP_Y 10 // r10 - r17 (8 GRFs) +#define nTOP_U 18 // r18 - r25 (8 GRFs) +#define nTOP_V 26 // r26 - r33 (8 GRFs) + +#define nBOT_Y 56 // r56 - r63 (8 GRFs) +#define nBOT_U 64 // r64 - r71 (8 GRFs) +#define nBOT_V 72 // r72 - r79 (8 GRFs) + +// Define temp space for any usages +#define nTEMP0 34 +#define nTEMP1 35 +#define nTEMP2 36 +#define nTEMP3 37 +#define nTEMP4 38 +#define nTEMP5 39 +#define nTEMP6 40 +#define nTEMP7 41 +#define nTEMP8 42 +#define nTEMP10 44 +#define nTEMP12 46 +#define nTEMP14 48 +#define nTEMP16 50 +#define nTEMP17 51 +#define nTEMP18 52 + +#define nTEMP24 58 + +// Common region 1 +.declare ubTOP_Y Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub +.declare ubTOP_U Base=REG(r,nTOP_U) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub +.declare ubTOP_V Base=REG(r,nTOP_V) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub + +.declare uwTOP_Y Base=REG(r,nTOP_Y) ElementSize=2 SrcRegion=REGION(16,1) DstRegion=<1> Type=uw +.declare uwTOP_U Base=REG(r,nTOP_U) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare uwTOP_V Base=REG(r,nTOP_V) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare ub2TOP_Y Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(16,2) DstRegion=<1> Type=ub +.declare ub2TOP_U Base=REG(r,nTOP_U) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub +.declare ub2TOP_V Base=REG(r,nTOP_V) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub + +.declare ub4TOP_Y Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(8,4) Type=ub +.declare ub4TOP_U Base=REG(r,nTOP_U) ElementSize=1 SrcRegion=REGION(8,4) Type=ub +.declare ub4TOP_V Base=REG(r,nTOP_V) ElementSize=1 SrcRegion=REGION(8,4) Type=ub + +.declare ubTOP_ARGB Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(8,4) Type=ub + +// Used by "send" instruction +.declare udTOP_Y_IO Base=REG(r,nTOP_Y) ElementSize=4 SrcRegion=REGION(8,1) Type=ud +.declare udTOP_U_IO Base=REG(r,nTOP_U) ElementSize=4 
SrcRegion=REGION(8,1) Type=ud +.declare udTOP_V_IO Base=REG(r,nTOP_V) ElementSize=4 SrcRegion=REGION(8,1) Type=ud + +// Common region 2 +.declare ubBOT_Y Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub +.declare ubBOT_U Base=REG(r,nBOT_U) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub +.declare ubBOT_V Base=REG(r,nBOT_V) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub + +.declare uwBOT_Y Base=REG(r,nBOT_Y) ElementSize=2 SrcRegion=REGION(16,1) DstRegion=<1> Type=uw +.declare uwBOT_U Base=REG(r,nBOT_U) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare uwBOT_V Base=REG(r,nBOT_V) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare ub2BOT_Y Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(16,2) DstRegion=<1> Type=ub +.declare ub2BOT_U Base=REG(r,nBOT_U) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub +.declare ub2BOT_V Base=REG(r,nBOT_V) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub + +.declare ubBOT_ARGB Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(8,4) Type=ub + +// Used by "send" instruction +.declare udBOT_Y_IO Base=REG(r,nBOT_Y) ElementSize=4 SrcRegion=REGION(8,1) Type=ud +.declare udBOT_U_IO Base=REG(r,nBOT_U) ElementSize=4 SrcRegion=REGION(8,1) Type=ud +.declare udBOT_V_IO Base=REG(r,nBOT_V) ElementSize=4 SrcRegion=REGION(8,1) Type=ud + +// End of common.inc + +#endif // COMMON_INC diff --git a/i965_drv_video/shaders/post_processing/Common/readSampler16x1.asm b/i965_drv_video/shaders/post_processing/Common/readSampler16x1.asm new file mode 100644 index 0000000..36c4be6 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/readSampler16x1.asm @@ -0,0 +1,55 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: readSampler16x1.asm +// +// Read one row of pix through sampler +// + + + +//#define SAMPLER_MSG_DSC 0x166A0000 // ILK Sampler Message Descriptor + + + +// Send Message [DevILK] Message Descriptor +// MBZ MsgL=5 MsgR=8 H MBZ SIMD MsgType SmplrIndx BindTab +// 000 0 101 0 1000 1 0 10 0000 0000 00000000 +// 0 A 8 A 0 0 0 0 + +// MsgL=1+2*2(u,v)=5 MsgR=8 + +#define SAMPLER_MSG_DSC 0x0A8A0000 // ILK Sampler Message Descriptor + + + + + + + + // Assume MSGSRC is set already in the caller + //mov (8) rMSGSRC.0<1>:ud 0:ud // Unused fields + + + + // Read 16 sampled pixels and store them as float32 in 8 GRFs + // 422 data is expanded to 444, return 8 GRF in the order of RGB- (UYV-). + // 420 data has three surfaces, return 8 GRF. Valid is always in the 1st GRF when in R8. Make sure not to overwrite the following 3 GRFs. + // alpha data is expanded to 4444, return 8 GRF in the order of RGBA (UYVA). + + mov(16) mMSGHDR<1>:uw rMSGSRC<16;16,1>:uw + send (16) DATABUF(0)<1> mMSGHDR udDUMMY_NULL 0x2 SAMPLER_MSG_DSC+SAMPLER_IDX+BINDING_IDX:ud + + + + + + + diff --git a/i965_drv_video/shaders/post_processing/Common/undefall.inc b/i965_drv_video/shaders/post_processing/Common/undefall.inc new file mode 100644 index 0000000..241bd70 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Common/undefall.inc @@ -0,0 +1,65 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: undefall.inc +// +// Undefine all global SRC/DEST symbols before a new process +// + +//Source definitions +#undef ubSRC_Y +#undef ubSRC_U +#undef ubSRC_V + +#undef ub2SRC_Y +#undef ub2SRC_U +#undef ub2SRC_V + +#undef ub4SRC_Y +#undef ub4SRC_U +#undef ub4SRC_V + +#undef uwSRC_Y +#undef uwSRC_U +#undef uwSRC_V + +#undef udSRC_Y +#undef udSRC_U +#undef udSRC_V + +#undef udSRC_YUV +#undef nSRC_YUV_REG + +//Destination definitions +#undef ubDEST_Y +#undef ubDEST_U +#undef ubDEST_V + +#undef ub2DEST_Y +#undef ub2DEST_U +#undef ub2DEST_V + +#undef ub4DEST_Y +#undef ub4DEST_U +#undef ub4DEST_V + +#undef uwDEST_Y +#undef uwDEST_U +#undef uwDEST_V + +#undef udDEST_Y +#undef udDEST_U +#undef udDEST_V + +#undef udDEST_YUV +#undef nDEST_YUV_REG +#undef ubDEST_ARGB + +// End of undefall.inc diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_IEF.inc b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_IEF.inc new file mode 100644 index 0000000..cbed61a --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_IEF.inc @@ -0,0 +1,108 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: AVS_IEF.inc + +#ifndef _AVS_INF_INC_ +#define _AVS_INF_INC_ + +#include "undefall.inc" //Undefine the SRC and DEST symbols + + // Message Header + // m0.7 31:0 Debug + // m0.6 31:0 Debug + // m0.5 31:0 Ignored + // m0.4 31:0 Ignored + // m0.3 31:0 Ignored + // m0.2 31:16 Ignored + // 15 Alpha Write Channel Mask enable=0, disable=1 + // 14 Blue Write Channel Mask (V) + // 13 Green Write Channel Mask (Y) + // 12 Red Write Channel Mask (U) + // 11:0 Ignored + // m0.1 Ignored + // m0.0 Ignored + +#define mAVS_8x8_HDR m0 // Message Header +#define mAVS_PAYLOAD m1 // Message Payload Header + +#define mAVS_8x8_HDR_2 m2 // Message Header +#define mAVS_PAYLOAD_2 m3 // Message Payload Header + +#define mAVS_8x8_HDR_UV m2 // Message Header +#define mAVS_PAYLOAD_UV m3 // Message Payload Header + +#define rAVS_8x8_HDR rMSGSRC // Mirror of Message Header +#define rAVS_PAYLOAD r9 // Mirror of Message Payload Header + + // AVS payload + // m1.7 Ignored + // m1.6 Pixel 0 V Address ---> ORIY (Y0) + // m1.5 Delta V ---> Step Y + // m1.4 Ignored + // m1.3 Ignored + // m1.2 Pixel 0 U Address ---> ORIX (X0) + // m1.1 U 2nd Derivative ---> NLAS dx + // m1.0 Delta U ---> Step X + + // Sampler Message Descriptor + // 31:29 Reserved 000 + // 28:25 Message length 0010 + // 24:20 Response length xxxxx ---> 4GRFs for each enabled channel + // 19 Header Present 1 + // 18 MBZ 0 + // 17:16 SIMD Mode 11 ---> SIMD64 + // 15:12 Message Type 1011 ---> sample_8x8 (the constants below carry 0xB in bits 15:12) + // 11:8 Sampler Index xxxx + // 7:0 Binding Table Index xxxxxxxx +#define nAVS_MSG_DSC_1CH 0x044BB000 +#define nAVS_MSG_DSC_2CH 0x048BB000 +#define nAVS_MSG_DSC_3CH 0x04CBB000 +#define nAVS_MSG_DSC_4CH 0x050BB000 + +#define nAVS_RED_CHANNEL_ONLY 0x0000E000 // Enable Red channel only +#define nAVS_GREEN_CHANNEL_ONLY 0x0000D000 // Enable Green channel only +#define nAVS_RED_BLUE_CHANNELS 0x0000A000 // Enable Red and Blue channels +#define nAVS_RGB_CHANNELS 0x00008000 // Enable RGB(YUV) channels +#define 
nAVS_ALL_CHANNELS 0x00000000 // Enable all channels (ARGB\AYUV) + + + +.declare ubAVS_RESPONSE Base=REG(r,nTEMP8) ElementSize=1 SrcRegion=REGION(16,1) Type=ub +.declare uwAVS_RESPONSE Base=REG(r,nTEMP8) ElementSize=2 SrcRegion=REGION(16,1) Type=uw + +.declare ubAVS_RESPONSE_2 Base=REG(r,nTEMP24) ElementSize=1 SrcRegion=REGION(16,1) Type=ub +.declare uwAVS_RESPONSE_2 Base=REG(r,nTEMP24) ElementSize=2 SrcRegion=REGION(16,1) Type=uw + + +#if (nSRC_REGION==nREGION_2) + #define uwDEST_Y uwBOT_Y + #define uwDEST_U uwBOT_U + #define uwDEST_V uwBOT_V + + #define ubDEST_Y ubBOT_Y + + #undef nSRC_REGION + #define nSRC_REGION nREGION_2 + +#else //(nSRC_REGION==nREGION_1) + #define uwDEST_Y uwTOP_Y + #define uwDEST_U uwTOP_U + #define uwDEST_V uwTOP_V + + #define ubDEST_Y ubTOP_Y + + #undef nSRC_REGION + #define nSRC_REGION nREGION_1 + +#endif + + +#endif //_AVS_INF_INC_ diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupFirstBlock.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupFirstBlock.asm new file mode 100644 index 0000000..d45ce44 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupFirstBlock.asm @@ -0,0 +1,35 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//------------------------------------------------------------------------------ +// AVS_SetupFirstBlock.asm +//------------------------------------------------------------------------------ + + // Setup Message Header +// mov (8) mAVS_8x8_HDR<1>:ud rMSGSRC<8;8,1>:ud + + // Check NLAS Enable bit + and.z.f0.0 (1) wNULLREG uwNLAS_ENABLE:uw BIT15:uw + (f0.0)mov (1) fVIDEO_STEP_DELTA:f 0.0:f + + // Setup Message Payload Header for 1st block of Media Sampler 8x8 + mov (1) rAVS_PAYLOAD.0:f fVIDEO_STEP_DELTA:f //NLAS dx + mov (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f //Step X + mov (1) rAVS_PAYLOAD.5:f fVIDEO_STEP_Y:f //Step Y + mov (2) rAVS_PAYLOAD.2<4>:f fSRC_VID_H_ORI<2;2,1>:f //Orig X and Y + + + + + + + + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupSecondBlock.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupSecondBlock.asm new file mode 100644 index 0000000..8f125dc --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/AVS_SetupSecondBlock.asm @@ -0,0 +1,27 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//------------------------------------------------------------------------------ +// AVS_SetupSecondBlock.asm +//------------------------------------------------------------------------------ + + //NLAS calculations for 2nd block of Media Sampler 8x8: + // X(i) = X0 + dx*i + ddx*i*(i-1)/2 ==> X(8) = X0 + dx*8 +ddx*28 + // dx(i)= dx(0) + ddx*i ==> dx(8)= dx + ddx*8 + + // Calculating X(8) + mov (1) acc0.2<1>:f fSRC_VID_H_ORI:f + mac (1) acc0.2<1>:f fVIDEO_STEP_X:f 8.0:f + mac (1) rAVS_PAYLOAD.2:f fVIDEO_STEP_DELTA:f 28.0:f + + // Calculating dx(8) + mov (1) acc0.1<1>:f fVIDEO_STEP_X:f + mac (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_DELTA:f 8.0:f + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DI.inc b/i965_drv_video/shaders/post_processing/Core_Kernels/DI.inc new file mode 100644 index 0000000..62f84c0 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DI.inc @@ -0,0 +1,194 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +// Module name: DI.inc + +#ifdef GT +// GT DI Kernel +#else // ILK +// ILK DI Kernel +#endif + + +//--------------------------------------------------------------------------- +// Binding table indices +//--------------------------------------------------------------------------- +#define nBIDX_DI_PRV 10 // Previous DI-ed frame +#define nBIDX_DI_CUR 13 // Current DI-ed frame +#define nBIDX_DN 7 // Denoised frame +#define nBIDX_STAT 20 // Statistics +#define nBIDX_DI_Source 4 // Source Surface + + +//--------------------------------------------------------------------------- +// Message descriptors +//--------------------------------------------------------------------------- +// Extended message descriptor +#define nSMPL_ENGINE 0x2 +#define nDATAPORT_WRITE 0x5 +#define nTS_EOT 0x27 // with End-Of-Thread bit ON + + // Message descriptor for end-of-thread + // = 000 0001 (message len) 00000 (resp len) + // 0 (header present 0) 00000000000000 0 (URB dereferenced) 0000 +#define nEOT_MSGDSC 0x02000000 + + // Message descriptor for sampler read + // = 000 0010 (message len 2) 00000 (resp len - set later, 12 or 5 or 11) + // 1 (header present 1) 0 11 (SIMD32/64 mode) + // 1000 (message type) 0000 (DI state index) + // 00000000 (binding table index - set later) + // = 0x040b8000 + +// comment begin +// The following is commented out because of walker feature +// It corresponds to the #ifdef GT #else and #endif +//#define nSMPL_MSGDSC 0x040b8000 +//#define nSMPL_RESP_LEN_DI 0x00c00000 // 12 +//#define nSMPL_RESP_LEN_NODI_PL 0x00500000 // 5 +//#define nSMPL_RESP_LEN_NODI_PA 0x00900000 // 9 +//#define nSMPL_RESP_LEN_NODN 0x00900000 // 9 +//#define nSMPL_RESP_LEN_PDI 0x00b00000 // 11 +// comment end + +#ifdef GT + +#define nSMPL_MSGDSC 0x040b8000 +#define nSMPL_RESP_LEN_DI 0x00c00000 // 12 +#define nSMPL_RESP_LEN_NODI_PL 0x00500000 // 5 //DI disable, the XY stored in 5th GRF, no impact to return length +#define nSMPL_RESP_LEN_NODI_PA 0x00900000 // 9 //DI disable, 
the XY stored in 5th GRF, no impact to return length +#define nSMPL_RESP_LEN_NODN 0x00a00000 // 10 //NO DN, originally use 9, now we need use 10 to store the XY with walker +#define nSMPL_RESP_LEN_PDI 0x00b00000 // 11 + +#else + +#define nSMPL_MSGDSC 0x040b8000 +#define nSMPL_RESP_LEN_DI 0x00c00000 // 12 +#define nSMPL_RESP_LEN_NODI_PL 0x00500000 // 5 +#define nSMPL_RESP_LEN_NODI_PA 0x00900000 // 9 +#define nSMPL_RESP_LEN_NODN 0x00900000 // 9 +#define nSMPL_RESP_LEN_PDI 0x00b00000 // 11 + +#endif + + // Message descriptor for dataport media write +#ifdef GT + // = 000 0000 (message len - set later) 00000 (resp len 0) + // 1 (header present 1) 0 0 1010 (media block write) 00000 + // 00000000 (binding table index - set later) + // = 0x00094000 +#define nDPMW_MSGDSC 0x00094000 +#else // ILK + // = 000 0000 (message len - set later) 00000 (resp len 0) + // 1 (header present 1) 000 0 010 (media block write) 0000 + // 00000000 (binding table index - set later) + // = 0x00082000 +#define nDPMW_MSGDSC 0x00082000 +#endif +#define nDPMW_MSG_LEN_STMM 0x04000000 // 2 - STMM +#define nDPMW_MSG_LEN_DH 0x04000000 // 2 - Denoise history +#define nDPMW_MSG_LEN_PA_DN 0x0a000000 // 5 - Denoised output +#define nDPMW_MSG_LEN_PA_NODI 0x12000000 // 9 - Denoised output - denoise only - DI disabled +#define nDPMW_MSG_LEN_PL_DN 0x06000000 // 3 - Denoised output +#define nDPMW_MSG_LEN_PL_NODI 0x0a000000 // 5 - Denoised output - denoise only - DI disabled +#define nDPMW_MSG_LEN_DI 0x0a000000 // 5 - DI output + + +//--------------------------------------------------------------------------- +// Static and inline parameters +//--------------------------------------------------------------------------- +// Static parameters +.declare ubTFLD_FIRST Base=r1.27 ElementSize=1 Type=ub // top field first +.declare ubSRCYUVOFFSET Base=r1.4 ElementSize=1 Type=ub // source packed format +.declare ubDSTYUVOFFSET Base=r1.8 ElementSize=1 Type=ub // destination packed format +.declare uwSPITCH_DIV2 
Base=r1.10 ElementSize=2 Type=uw // statistics surface pitch divided by 2 + +// Inline parameters +.declare uwXORIGIN Base=r5.0 ElementSize=2 Type=uw // X and Y origin +.declare uwYORIGIN Base=r5.1 ElementSize=2 Type=uw + + +//--------------------------------------------------------------------------- +// Kernel GRF variables +//--------------------------------------------------------------------------- +// Message response (Denoised & DI-ed pixels & statistics) +.declare dRESP Base=r8 ElementSize=4 Type=d // Response message (12 or 5 or 11) +.declare ubRESP Base=r8 ElementSize=1 Type=ub + +.declare dSTMM Base=r16 ElementSize=4 Type=d // STMM +.declare ubDN_HIST_NODI Base=r12 ElementSize=1 Type=ub // Denoise history data (DI disabled) +.declare ubDN_HIST_DI Base=r17 ElementSize=1 Type=ub // Denoise history data (DI enabled) +.declare uwRETURNED_POSITION_DI Base=r17 ElementSize=2 Type=uw // XY_Return_Data (DI enabled) +.declare uwRETURNED_POSITION_DN Base=r12 ElementSize=2 Type=uw // XY_Return_Data (DI disabled) + +.declare ub1ST_FLD_DN Base=r12 ElementSize=1 Type=ub // 1st field Denoised data (DI enabled) +.declare d1ST_FLD_DN Base=r12 ElementSize=4 Type=d +.declare ub2ND_FLD_DN Base=r18 ElementSize=1 Type=ub // 2nd field Denoised data (DI enabled) +.declare d2ND_FLD_DN Base=r18 ElementSize=4 Type=d +.declare ubPRV_DI Base=r8 ElementSize=1 Type=ub // Previous frame DI (DI enabled) +.declare ubCUR_DI Base=r12 ElementSize=1 Type=ub // Current frame DI (DI enabled) + +// Packed denoised output +.declare ubDN_YUV Base=r22 ElementSize=1 Type=ub // Denoised YUV422 +.declare dDN_YUV Base=r22 ElementSize=4 Type=d +#define npDN_YUV 704 // = 22*32 = 0x2C0 + +// Packed DI output +.declare dDI_YUV_PRV Base=r32 ElementSize=4 Type=d // Previous frame DI output +.declare dDI_YUV_CUR Base=r36 ElementSize=4 Type=d // Current frame DI output +#define npDI_YUV 1024 // = 32*32 = 0x400 + +// For packed output +#define p422_YOFFSET a0.2 +#define p422_UOFFSET a0.3 +#define p422_VOFFSET 
a0.4 +#define pDN_TFLDSRC a0.6 +#define pDN_BFLDSRC a0.7 +#define npRESP 192 // = 6*32 + +// Message source +.declare udMSGSRC Base=r70 ElementSize=4 Type=ud +.declare uwMSGSRC Base=r70 ElementSize=2 Type=uw +.declare dMSGSRC Base=r70 ElementSize=4 Type=d + + +//--------------------------------------------------------------------------- +// Kernel MRF variables +//--------------------------------------------------------------------------- +#define mMSGHDR_SMPL m1 // Sampler response: m1~m2 +.declare mudMSGHDR_SMPL Base=m1 ElementSize=4 Type=ud +.declare muwMSGHDR_SMPL Base=m1 ElementSize=2 Type=uw +#define mMSGHDR_DN m3 // Denoise output: m3~m7 for PA, m3~m5 for PL +.declare mdMSGHDR_DN Base=m3 ElementSize=4 Type=d +#define mMSGHDR_STAT m8 // Statistics output: m8~m9 +.declare mdMSGHDR_STAT Base=m8 ElementSize=4 Type=d +.declare mubMSGHDR_STAT Base=m8 ElementSize=1 Type=ub +#define mMSGHDR_DI m10 // DI output: m10~m14 +.declare mdMSGHDR_DI Base=m10 ElementSize=4 Type=d +#define mMSGHDR_EOT m15 // EOT + +#ifdef GT +#define MSGSRC +#else +#define MSGSRC null:ud +#endif + + +//--------------------------------------------------------------------------- +// End of thread instruction +//--------------------------------------------------------------------------- +#ifdef GT +#define END_THREAD send (8) null<1>:d mMSGHDR_EOT nTS_EOT nEOT_MSGDSC +#else // ILK +#define END_THREAD send (8) null<1>:d mMSGHDR_EOT null:ud nTS_EOT nEOT_MSGDSC +#endif + + +// end of DI.inc diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DI_Hist_Save.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_Hist_Save.asm new file mode 100644 index 0000000..ae8ff85 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_Hist_Save.asm @@ -0,0 +1,24 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. 
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + + +// Write denoise history to memory +shr (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w 2:w NODDCLR // X,Y origin / 4 +add (1) rMSGSRC.0<1>:ud rMSGSRC.0<0;1,0>:ud uwSPITCH_DIV2<0;1,0>:uw NODDCLR_NODDCHK // Add pitch to X origin +mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_HIST:ud NODDCHK // block width and height (4x2) + +mov (8) mMSGHDR_HIST<1>:ud rMSGSRC.0<8;8,1>:ud // message header +mov (1) mudMSGHDR_HIST(1)<1> udRESP(nDI_HIST_OFFSET,0)<0;1,0> // Move denoise history to MRF + +send (8) dNULLREG mMSGHDR_HIST udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud + + + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DI_SAVE_PA.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_SAVE_PA.asm new file mode 100644 index 0000000..f4e2fe7 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DI_SAVE_PA.asm @@ -0,0 +1,56 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + + shl (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w NODDCLR // H. 
block origin needs to be doubled + mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w NODDCLR_NODDCHK // Block origin + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DI:ud NODDCHK // Block width and height (32x8) + + + add (4) pCF_Y_OFFSET<1>:uw ubDEST_CF_OFFSET<4;4,1>:ub nDEST_YUV_REG*nGRFWIB:w // Initial Y,U,V offset in YUV422 block + + // Pack 2nd field Y + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (16) r[pCF_Y_OFFSET, %1*nGRFWIB]<2> ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16) + } + // Pack 1st field Y -- NOTE(review): the 1st-field Y/U/V loops index with %1+4*nGRFWIB, which is %1+(4*nGRFWIB) under usual precedence; (%1+4)*nGRFWIB may be intended (cf. udDEST_YUV(%1+4) in the save loop below) - confirm how $for expands it + $for(0; <nY_NUM_OF_ROWS; 1) { + mov (16) r[pCF_Y_OFFSET, %1+4*nGRFWIB]<2> ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16) + } + // Pack 2nd field U + $for(0; <nUV_NUM_OF_ROWS; 1) { + mov (8) r[pCF_U_OFFSET, %1*nGRFWIB]<4> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels + } + // Pack 1st field U + $for(0; <nUV_NUM_OF_ROWS; 1) { + mov (8) r[pCF_U_OFFSET, %1+4*nGRFWIB]<4> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels + } + // Pack 2nd field V + $for(0; <nUV_NUM_OF_ROWS; 1) { + mov (8) r[pCF_V_OFFSET, %1*nGRFWIB]<4> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels + } + // Pack 1st field V + $for(0; <nUV_NUM_OF_ROWS; 1) { + mov (8) r[pCF_V_OFFSET, %1+4*nGRFWIB]<4> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels + } + + //save the previous frame + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + $for(0; <4; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udDEST_YUV(%1)REGION(8,1) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_1_YUV:ud + + //save the current frame + mov (8) mMSGHDR<1>:ud rMSGSRC<8;8,1>:ud + $for(0; <4; 1) { + mov (8) mudMSGPAYLOAD(%1)<1> udDEST_YUV(%1+4)REGION(8,1) + } + send (8) dNULLREG mMSGHDR udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPW_MSG_SIZE_DI+nBI_DESTINATION_2_YUV:ud + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI.inc b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI.inc new file mode 100644 index 0000000..3258756 --- /dev/null +++ 
b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI.inc @@ -0,0 +1,162 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Module name: DNDI.inc + +#ifdef GT +// GT DI Kernel +#else // ILK +// ILK DI Kernel +#endif + +#include "undefall.inc" + +//--------------------------------------------------------------------------- +// Message descriptors +//--------------------------------------------------------------------------- +// Extended message descriptor + // Message descriptor for sampler read +// // = 000 0010 (message len 2) 00000 (resp len - set later, 12, 5, 9 or 11) +// // 1 (header present 1) 0 11 (SIMD32/64 mode) +// // 1000 (message type) 0000 (DI state index) +// // 00000000 (binding table index - set later) +// // = 0x040b8000 +#define nSMPL_DI_MSGDSC 0x040b8000 + +#define nSMPL_RESP_LEN_DNDI nRESLEN_12 // 12 - for DN + DI Alg +#define nSMPL_RESP_LEN_DN_PL nRESLEN_5 // 5 - for DN Planar Alg +#define nSMPL_RESP_LEN_DN_PA nRESLEN_9 // 9 - for DN Packed Alg +#define nSMPL_RESP_LEN_DI nRESLEN_9 // 9 - for DI Only Alg +#define nSMPL_RESP_LEN_PDI nRESLEN_11 // 11 - for Partial DI Alg + +// Attention: The Message Length is The Number of GRFs with Data Only, without the Header +#define nDPMW_MSG_LEN_STMM nMSGLEN_1 // 1 - For STMM Save +#define nDPMW_MSG_LEN_HIST nMSGLEN_1 // 1 - For Denoise History Save +#define nDPMW_MSG_LEN_PA_DN_DI nMSGLEN_4 // 4 - For DN Curr Save +#define nDPMW_MSG_LEN_PA_DN_NODI nMSGLEN_8 // 8 - For DN Curr Save (denoise only - DI disabled) +#define nDPMW_MSG_LEN_PL_DN_DI nMSGLEN_2 // 2 - For DN Curr Save +#define nDPMW_MSG_LEN_PL_DN_NODI nMSGLEN_4 // 4 - For DN Curr Save (denoise only - DI disabled) + +#define nDPW_BLOCK_SIZE_STMM nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4 // Y block size 8x4 + 
+#undef nDPW_BLOCK_SIZE_DI +#undef nDPW_MSG_SIZE_DI +#define nDPW_BLOCK_SIZE_DI nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4 +#define nDPW_MSG_SIZE_DI nMSGLEN_4 + + +//--------------------------------------------------------------------------- +// Kernel GRF variables +//--------------------------------------------------------------------------- +// Defines for DI enabled +#define nDI_PREV_FRAME_LUMA_OFFSET 0 +#define nDI_PREV_FRAME_CHROMA_OFFSET 2 +#define nDI_CURR_FRAME_LUMA_OFFSET 4 +#define nDI_CURR_FRAME_CHROMA_OFFSET 6 +#define nDI_STMM_OFFSET 8 +#define nDI_HIST_OFFSET 9 +#define nDI_CURR_2ND_FIELD_LUMA_OFFSET 10 +#define nDI_CURR_2ND_FIELD_CHROMA_OFFSET 11 + +// Defines for DI disabled +#define nNODI_LUMA_OFFSET 0 +#define nNODI_HIST_OFFSET 4 +#define nNODI_CHROMA_OFFSET 5 + +#ifdef DI_ENABLE + #define nHIST_OFFSET nDI_HIST_OFFSET + #undef nY_NUM_OF_ROWS + #define nY_NUM_OF_ROWS 8 // Number of Y rows per block (4 rows for each frame) + #undef nUV_NUM_OF_ROWS + #define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block + +#endif + +#ifdef DI_DISABLE + #define nHIST_OFFSET nNODI_HIST_OFFSET +#endif + +#if (nSRC_REGION==nREGION_2) + #define ub2SRC_Y ub2BOT_Y + #define ub2SRC_U ub2BOT_U + #define ub2SRC_V ub2BOT_V + #define uwDEST_Y uwBOT_Y + #define uwDEST_U uwBOT_U + #define uwDEST_V uwBOT_V + #define nDEST_YUV_REG nTOP_Y + #define udDEST_YUV udTOP_Y_IO + + #define nRESP nTEMP0 // DI return message requires 12 GRFs + #define nDN_YUV nTOP_Y // Space for Packing DN for next run requires 8 GRFs + + #undef nSRC_REGION + #define nSRC_REGION nREGION_2 + +#else + #define ub2SRC_Y ub2TOP_Y + #define ub2SRC_U ub2TOP_U + #define ub2SRC_V ub2TOP_V + #define uwDEST_Y uwTOP_Y + #define uwDEST_U uwTOP_U + #define uwDEST_V uwTOP_V + #define nDEST_YUV_REG nBOT_Y + #define udDEST_YUV udBOT_Y_IO + #define nRESP nTEMP0 // DI return message requires 12 GRFs + #define nDN_YUV nBOT_Y // Space for Packing DN for next run requires 8 GRFs + + #undef nSRC_REGION + #define nSRC_REGION nREGION_1 // 
REGION_1 will be the source region for first kernel + +#endif + + + + + + + + + +// Message response (Denoised & DI-ed pixels & statistics) +.declare udRESP Base=REG(r,nRESP) ElementSize=4 SrcRegion=REGION(8,1) DstRegion=<1> Type=ud +.declare ubRESP Base=REG(r,nRESP) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub + +// For Denoised Curr Output (Used as Prev in Next Run) +.declare ubDN_YUV Base=REG(r,nDN_YUV) ElementSize=1 Type=ub +.declare udDN_YUV Base=REG(r,nDN_YUV) ElementSize=4 Type=ud +#define npDN_YUV nDN_YUV*nGRFWIB + +// For DI Process Output (1st and 2nd Frames Output) +//.declare udDI_YUV_PRIV Base=REG(r,nTEMP0) ElementSize=4 Type=ud // Previous frame DI output +//.declare udDI_YUV_CURR Base=REG(r,nTEMP0) ElementSize=4 Type=ud // Current frame DI output +//#define npDI_YUV nTEMP0*nGRFWIB + +//--------------------------------------------------------------------------- +// Kernel MRF variables +//--------------------------------------------------------------------------- +#define mMSG_SMPL m1 // Sampler Command is in: m1~m2 +.declare mudMSG_SMPL Base=mMSG_SMPL ElementSize=4 Type=ud +.declare muwMSG_SMPL Base=mMSG_SMPL ElementSize=2 Type=uw + +#define mMSGHDR_DN m1 // Denoise Output: m1~m9 for PA, m1~m5 for PL +.declare mudMSGHDR_DN Base=mMSGHDR_DN ElementSize=4 Type=ud +.declare mubMSGHDR_DN Base=mMSGHDR_DN ElementSize=1 Type=ub + +#define mMSGHDR_STMM m11 // STMM Output: m11~m12 +.declare mudMSGHDR_STMM Base=mMSGHDR_STMM ElementSize=4 Type=ud +#define mMSGHDR_HIST m13 // HIST Output: m13~m14 +.declare mudMSGHDR_HIST Base=mMSGHDR_HIST ElementSize=4 Type=ud + +#define mMSGHDR_DI_1ST m1 // DI output: m1~m5 +.declare mudMSGHDR_DI_1ST Base=mMSGHDR_DI_1ST ElementSize=4 Type=ud +#define mMSGHDR_DI_2ND m6 // DI output: m6~m10 +.declare mudMSGHDR_DI_2ND Base=mMSGHDR_DI_2ND ElementSize=4 Type=ud + +// end of DNDI.inc diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_COMMAND.asm 
b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_COMMAND.asm new file mode 100644 index 0000000..2c041fc --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_COMMAND.asm @@ -0,0 +1,17 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// Activate the DNDI send command +mov (8) mudMSG_SMPL(0)<1> rMSGSRC.0<8;8,1>:ud NODDCLR // message header +mov (1) muwMSG_SMPL(1,4)<1> wORIX<0;1,0>:w NODDCLR_NODDCHK// horizontal origin +mov (1) muwMSG_SMPL(1,12)<1> wORIY<0;1,0>:w NODDCLR_NODDCHK // vertical origin +//mov (2) muwMSG_SMPL(1,4)<2> wORIX<2;2,1>:w NODDCHK// problem during compile !! when using this line + +send (8) udRESP(0)<1> mMSG_SMPL udDUMMY_NULL nSMPL_ENGINE nSMPL_DI_MSGDSC+nSMPL_RESP_LEN+nBI_CURRENT_SRC_YUV_HW_DI:ud diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_Hist_Save.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_Hist_Save.asm new file mode 100644 index 0000000..91c5bc2 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/DNDI_Hist_Save.asm @@ -0,0 +1,20 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + + +// Write denoise history to memory +shr (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w 2:w NODDCLR // X,Y origin / 4 +add (1) rMSGSRC.0<1>:ud rMSGSRC.0<0;1,0>:ud uwSPITCH_DIV2<0;1,0>:uw NODDCLR_NODDCHK// Add pitch to X origin +mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_HIST:ud NODDCHK // block width and height (4x2) + +mov (8) mMSGHDR_HIST<1>:ud rMSGSRC.0<8;8,1>:ud // message header +mov (2) mudMSGHDR_HIST(1)<1> udRESP(nNODI_HIST_OFFSET,0)<2;2,1> // Move denoise history to MRF + +send (8) dNULLREG mMSGHDR_HIST udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_HIST+nBI_STMM_HISTORY_OUTPUT:ud diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_16x8.asm new file mode 100644 index 0000000..55f71b5 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_16x8.asm @@ -0,0 +1,26 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//---------- PA_AVS_IEF_16x8.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 YUV packed +//------------------------------------------------------------------------------ +#include "PA_AVS_IEF_Sample.asm" + +//------------------------------------------------------------------------------ +// Unpacking sampler reads to 4:4:4 internal planar +//------------------------------------------------------------------------------ +#include "PA_AVS_IEF_Unpack_16x8.asm" + +//------------------------------------------------------------------------------ + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x4.asm new file mode 100644 index 0000000..55c201b --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x4.asm @@ -0,0 +1,25 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//---------- PA_AVS_IEF_8x4.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 YUV packed +//------------------------------------------------------------------------------ +#include "PA_AVS_IEF_Sample.asm" + +//------------------------------------------------------------------------------ +// Unpacking sampler data to 4:2:0 internal planar +//------------------------------------------------------------------------------ +#include "PA_AVS_IEF_Unpack_8x4.asm" + +//------------------------------------------------------------------------------ diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x8.asm new file mode 100644 index 0000000..6bde8c4 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_8x8.asm @@ -0,0 +1,25 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//---------- PA_AVS_IEF_8x8.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 YUV packed +//------------------------------------------------------------------------------ +#include "PA_AVS_IEF_Sample.asm" + +//------------------------------------------------------------------------------ +// Unpacking sampler data to 4:2:2 internal planar +//------------------------------------------------------------------------------ +#include "PA_AVS_IEF_Unpack_8x8.asm" + +//------------------------------------------------------------------------------ diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Sample.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Sample.asm new file mode 100644 index 0000000..0b533ef --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Sample.asm @@ -0,0 +1,34 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//---------- PA_AVS_IEF_Sample.asm ---------- + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 YUV packed +//------------------------------------------------------------------------------ + + // 1st 8x8 setup + #include "AVS_SetupFirstBlock.asm" + + // Enable RGB(YUV) channels + mov (1) rAVS_8x8_HDR.2:ud nAVS_RGB_CHANNELS:ud + + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV + // Return YUV in 12 GRFs + + // 2nd 8x8 setup + #include "AVS_SetupSecondBlock.asm" + + mov (16) mAVS_8x8_HDR_2.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR_2 udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_3CH+nSI_SRC_YUV+nBI_CURRENT_SRC_YUV + // Return YUV in 12 GRFs + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_16x8.asm new file mode 100644 index 0000000..5dcc988 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_16x8.asm @@ -0,0 +1,288 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//---------- PA_AVS_IEF_Unpack_16x8.asm ---------- + +#ifdef AVS_OUTPUT_16_BIT //Output is packed in AVYU format +// Move first 8x8 words of Y to dest GRF (as packed) + mov (4) uwDEST_Y(0,1)<4> uwAVS_RESPONSE(2,0)<4;4,1> + mov (4) uwDEST_Y(1,1)<4> uwAVS_RESPONSE(2,8)<4;4,1> + mov (4) uwDEST_Y(4,1)<4> uwAVS_RESPONSE(2,4)<4;4,1> + mov (4) uwDEST_Y(5,1)<4> uwAVS_RESPONSE(2,12)<4;4,1> + mov (4) uwDEST_Y(8,1)<4> uwAVS_RESPONSE(3,0)<4;4,1> + mov (4) uwDEST_Y(9,1)<4> uwAVS_RESPONSE(3,8)<4;4,1> + mov (4) uwDEST_Y(12,1)<4> uwAVS_RESPONSE(3,4)<4;4,1> + mov (4) uwDEST_Y(13,1)<4> uwAVS_RESPONSE(3,12)<4;4,1> + mov (4) uwDEST_Y(16,1)<4> uwAVS_RESPONSE(8,0)<4;4,1> + mov (4) uwDEST_Y(17,1)<4> uwAVS_RESPONSE(8,8)<4;4,1> + mov (4) uwDEST_Y(20,1)<4> uwAVS_RESPONSE(8,4)<4;4,1> + mov (4) uwDEST_Y(21,1)<4> uwAVS_RESPONSE(8,12)<4;4,1> + mov (4) uwDEST_Y(24,1)<4> uwAVS_RESPONSE(9,0)<4;4,1> + mov (4) uwDEST_Y(25,1)<4> uwAVS_RESPONSE(9,8)<4;4,1> + mov (4) uwDEST_Y(28,1)<4> uwAVS_RESPONSE(9,4)<4;4,1> + mov (4) uwDEST_Y(29,1)<4> uwAVS_RESPONSE(9,12)<4;4,1> + +// Move first 8x8 words of U to dest GRF (as packed) + mov (4) uwDEST_Y(0,0)<4> uwAVS_RESPONSE(4,0)<4;4,1> + mov (4) uwDEST_Y(1,0)<4> uwAVS_RESPONSE(4,8)<4;4,1> + mov (4) uwDEST_Y(4,0)<4> uwAVS_RESPONSE(4,4)<4;4,1> + mov (4) uwDEST_Y(5,0)<4> uwAVS_RESPONSE(4,12)<4;4,1> + mov (4) uwDEST_Y(8,0)<4> uwAVS_RESPONSE(5,0)<4;4,1> + mov (4) uwDEST_Y(9,0)<4> uwAVS_RESPONSE(5,8)<4;4,1> + mov (4) uwDEST_Y(12,0)<4> uwAVS_RESPONSE(5,4)<4;4,1> + mov (4) uwDEST_Y(13,0)<4> uwAVS_RESPONSE(5,12)<4;4,1> + mov (4) uwDEST_Y(16,0)<4> uwAVS_RESPONSE(10,0)<4;4,1> + mov (4) uwDEST_Y(17,0)<4> uwAVS_RESPONSE(10,8)<4;4,1> + mov (4) uwDEST_Y(20,0)<4> uwAVS_RESPONSE(10,4)<4;4,1> + mov (4) uwDEST_Y(21,0)<4> uwAVS_RESPONSE(10,12)<4;4,1> + mov (4) uwDEST_Y(24,0)<4> uwAVS_RESPONSE(11,0)<4;4,1> + mov (4) uwDEST_Y(25,0)<4> uwAVS_RESPONSE(11,8)<4;4,1> + mov (4) uwDEST_Y(28,0)<4> uwAVS_RESPONSE(11,4)<4;4,1> + mov (4) uwDEST_Y(29,0)<4> uwAVS_RESPONSE(11,12)<4;4,1> 
+ +// Move first 8x8 words of V to dest GRF (as packed) + mov (4) uwDEST_Y(0,2)<4> uwAVS_RESPONSE(0,0)<4;4,1> + mov (4) uwDEST_Y(1,2)<4> uwAVS_RESPONSE(0,8)<4;4,1> + mov (4) uwDEST_Y(4,2)<4> uwAVS_RESPONSE(0,4)<4;4,1> + mov (4) uwDEST_Y(5,2)<4> uwAVS_RESPONSE(0,12)<4;4,1> + mov (4) uwDEST_Y(8,2)<4> uwAVS_RESPONSE(1,0)<4;4,1> + mov (4) uwDEST_Y(9,2)<4> uwAVS_RESPONSE(1,8)<4;4,1> + mov (4) uwDEST_Y(12,2)<4> uwAVS_RESPONSE(1,4)<4;4,1> + mov (4) uwDEST_Y(13,2)<4> uwAVS_RESPONSE(1,12)<4;4,1> + mov (4) uwDEST_Y(16,2)<4> uwAVS_RESPONSE(6,0)<4;4,1> + mov (4) uwDEST_Y(17,2)<4> uwAVS_RESPONSE(6,8)<4;4,1> + mov (4) uwDEST_Y(20,2)<4> uwAVS_RESPONSE(6,4)<4;4,1> + mov (4) uwDEST_Y(21,2)<4> uwAVS_RESPONSE(6,12)<4;4,1> + mov (4) uwDEST_Y(24,2)<4> uwAVS_RESPONSE(7,0)<4;4,1> + mov (4) uwDEST_Y(25,2)<4> uwAVS_RESPONSE(7,8)<4;4,1> + mov (4) uwDEST_Y(28,2)<4> uwAVS_RESPONSE(7,4)<4;4,1> + mov (4) uwDEST_Y(29,2)<4> uwAVS_RESPONSE(7,12)<4;4,1> + +// Move first 8x8 words of A to dest GRF (as packed) + mov (4) uwDEST_Y(0,3)<4> 0:uw + mov (4) uwDEST_Y(1,3)<4> 0:uw + mov (4) uwDEST_Y(4,3)<4> 0:uw + mov (4) uwDEST_Y(5,3)<4> 0:uw + mov (4) uwDEST_Y(8,3)<4> 0:uw + mov (4) uwDEST_Y(9,3)<4> 0:uw + mov (4) uwDEST_Y(12,3)<4> 0:uw + mov (4) uwDEST_Y(13,3)<4> 0:uw + mov (4) uwDEST_Y(16,3)<4> 0:uw + mov (4) uwDEST_Y(17,3)<4> 0:uw + mov (4) uwDEST_Y(20,3)<4> 0:uw + mov (4) uwDEST_Y(21,3)<4> 0:uw + mov (4) uwDEST_Y(24,3)<4> 0:uw + mov (4) uwDEST_Y(25,3)<4> 0:uw + mov (4) uwDEST_Y(28,3)<4> 0:uw + mov (4) uwDEST_Y(29,3)<4> 0:uw + +// Move second 8x8 words of Y to dest GRF + mov (4) uwDEST_Y(2,1)<4> uwAVS_RESPONSE_2(2,0)<4;4,1> + mov (4) uwDEST_Y(3,1)<4> uwAVS_RESPONSE_2(2,8)<4;4,1> + mov (4) uwDEST_Y(6,1)<4> uwAVS_RESPONSE_2(2,4)<4;4,1> + mov (4) uwDEST_Y(7,1)<4> uwAVS_RESPONSE_2(2,12)<4;4,1> + mov (4) uwDEST_Y(10,1)<4> uwAVS_RESPONSE_2(3,0)<4;4,1> + mov (4) uwDEST_Y(11,1)<4> uwAVS_RESPONSE_2(3,8)<4;4,1> + mov (4) uwDEST_Y(14,1)<4> uwAVS_RESPONSE_2(3,4)<4;4,1> + mov (4) uwDEST_Y(15,1)<4> 
uwAVS_RESPONSE_2(3,12)<4;4,1> + mov (4) uwDEST_Y(18,1)<4> uwAVS_RESPONSE_2(8,0)<4;4,1> + mov (4) uwDEST_Y(19,1)<4> uwAVS_RESPONSE_2(8,8)<4;4,1> + mov (4) uwDEST_Y(22,1)<4> uwAVS_RESPONSE_2(8,4)<4;4,1> + mov (4) uwDEST_Y(23,1)<4> uwAVS_RESPONSE_2(8,12)<4;4,1> + mov (4) uwDEST_Y(26,1)<4> uwAVS_RESPONSE_2(9,0)<4;4,1> + mov (4) uwDEST_Y(27,1)<4> uwAVS_RESPONSE_2(9,8)<4;4,1> + mov (4) uwDEST_Y(30,1)<4> uwAVS_RESPONSE_2(9,4)<4;4,1> + mov (4) uwDEST_Y(31,1)<4> uwAVS_RESPONSE_2(9,12)<4;4,1> + +// Move second 8x8 words of U to dest GRF + mov (4) uwDEST_Y(2,0)<4> uwAVS_RESPONSE_2(4,0)<4;4,1> + mov (4) uwDEST_Y(3,0)<4> uwAVS_RESPONSE_2(4,8)<4;4,1> + mov (4) uwDEST_Y(6,0)<4> uwAVS_RESPONSE_2(4,4)<4;4,1> + mov (4) uwDEST_Y(7,0)<4> uwAVS_RESPONSE_2(4,12)<4;4,1> + mov (4) uwDEST_Y(10,0)<4> uwAVS_RESPONSE_2(5,0)<4;4,1> + mov (4) uwDEST_Y(11,0)<4> uwAVS_RESPONSE_2(5,8)<4;4,1> + mov (4) uwDEST_Y(14,0)<4> uwAVS_RESPONSE_2(5,4)<4;4,1> + mov (4) uwDEST_Y(15,0)<4> uwAVS_RESPONSE_2(5,12)<4;4,1> + mov (4) uwDEST_Y(18,0)<4> uwAVS_RESPONSE_2(10,0)<4;4,1> + mov (4) uwDEST_Y(19,0)<4> uwAVS_RESPONSE_2(10,8)<4;4,1> + mov (4) uwDEST_Y(22,0)<4> uwAVS_RESPONSE_2(10,4)<4;4,1> + mov (4) uwDEST_Y(23,0)<4> uwAVS_RESPONSE_2(10,12)<4;4,1> + mov (4) uwDEST_Y(26,0)<4> uwAVS_RESPONSE_2(11,0)<4;4,1> + mov (4) uwDEST_Y(27,0)<4> uwAVS_RESPONSE_2(11,8)<4;4,1> + mov (4) uwDEST_Y(30,0)<4> uwAVS_RESPONSE_2(11,4)<4;4,1> + mov (4) uwDEST_Y(31,0)<4> uwAVS_RESPONSE_2(11,12)<4;4,1> + +// Move second 8x8 words of V to dest GRF + mov (4) uwDEST_Y(2,2)<4> uwAVS_RESPONSE_2(0,0)<4;4,1> + mov (4) uwDEST_Y(3,2)<4> uwAVS_RESPONSE_2(0,8)<4;4,1> + mov (4) uwDEST_Y(6,2)<4> uwAVS_RESPONSE_2(0,4)<4;4,1> + mov (4) uwDEST_Y(7,2)<4> uwAVS_RESPONSE_2(0,12)<4;4,1> + mov (4) uwDEST_Y(10,2)<4> uwAVS_RESPONSE_2(1,0)<4;4,1> + mov (4) uwDEST_Y(11,2)<4> uwAVS_RESPONSE_2(1,8)<4;4,1> + mov (4) uwDEST_Y(14,2)<4> uwAVS_RESPONSE_2(1,4)<4;4,1> + mov (4) uwDEST_Y(15,2)<4> uwAVS_RESPONSE_2(1,12)<4;4,1> + mov (4) uwDEST_Y(18,2)<4> 
uwAVS_RESPONSE_2(6,0)<4;4,1> + mov (4) uwDEST_Y(19,2)<4> uwAVS_RESPONSE_2(6,8)<4;4,1> + mov (4) uwDEST_Y(22,2)<4> uwAVS_RESPONSE_2(6,4)<4;4,1> + mov (4) uwDEST_Y(23,2)<4> uwAVS_RESPONSE_2(6,12)<4;4,1> + mov (4) uwDEST_Y(26,2)<4> uwAVS_RESPONSE_2(7,0)<4;4,1> + mov (4) uwDEST_Y(27,2)<4> uwAVS_RESPONSE_2(7,8)<4;4,1> + mov (4) uwDEST_Y(30,2)<4> uwAVS_RESPONSE_2(7,4)<4;4,1> + mov (4) uwDEST_Y(31,2)<4> uwAVS_RESPONSE_2(7,12)<4;4,1> + +// Move second 8x8 words of A to dest GRF + mov (4) uwDEST_Y(2,3)<4> 0:uw + mov (4) uwDEST_Y(3,3)<4> 0:uw + mov (4) uwDEST_Y(6,3)<4> 0:uw + mov (4) uwDEST_Y(7,3)<4> 0:uw + mov (4) uwDEST_Y(10,3)<4> 0:uw + mov (4) uwDEST_Y(11,3)<4> 0:uw + mov (4) uwDEST_Y(14,3)<4> 0:uw + mov (4) uwDEST_Y(15,3)<4> 0:uw + mov (4) uwDEST_Y(18,3)<4> 0:uw + mov (4) uwDEST_Y(19,3)<4> 0:uw + mov (4) uwDEST_Y(22,3)<4> 0:uw + mov (4) uwDEST_Y(23,3)<4> 0:uw + mov (4) uwDEST_Y(26,3)<4> 0:uw + mov (4) uwDEST_Y(27,3)<4> 0:uw + mov (4) uwDEST_Y(30,3)<4> 0:uw + mov (4) uwDEST_Y(31,3)<4> 0:uw + +/* This section will be used if 16-bit output is needed in planar format -vK + // Move first 8x8 words of Y to dest GRF + mov (8) uwDEST_Y(0)<1> uwAVS_RESPONSE(2,0)<8;4,1> + mov (8) uwDEST_Y(1)<1> uwAVS_RESPONSE(2,8)<8;4,1> + mov (8) uwDEST_Y(2)<1> uwAVS_RESPONSE(3,0)<8;4,1> + mov (8) uwDEST_Y(3)<1> uwAVS_RESPONSE(3,8)<8;4,1> + mov (8) uwDEST_Y(4)<1> uwAVS_RESPONSE(8,0)<8;4,1> + mov (8) uwDEST_Y(5)<1> uwAVS_RESPONSE(8,8)<8;4,1> + mov (8) uwDEST_Y(6)<1> uwAVS_RESPONSE(9,0)<8;4,1> + mov (8) uwDEST_Y(7)<1> uwAVS_RESPONSE(9,8)<8;4,1> + + // Move first 8x8 words of V to dest GRF + mov (8) uwDEST_V(0)<1> ubAVS_RESPONSE(0,0)<8;4,1> + mov (8) uwDEST_V(1)<1> ubAVS_RESPONSE(0,8)<8;4,1> + mov (8) uwDEST_V(2)<1> ubAVS_RESPONSE(1,0)<8;4,1> + mov (8) uwDEST_V(3)<1> ubAVS_RESPONSE(1,8)<8;4,1> + mov (8) uwDEST_V(4)<1> ubAVS_RESPONSE(6,0)<8;4,1> + mov (8) uwDEST_V(5)<1> ubAVS_RESPONSE(6,8)<8;4,1> + mov (8) uwDEST_V(6)<1> ubAVS_RESPONSE(7,0)<8;4,1> + mov (8) uwDEST_V(7)<1> ubAVS_RESPONSE(7,8)<8;4,1> 
+ + // Move first 8x8 words of U to dest GRF + mov (8) uwDEST_U(0)<1> ubAVS_RESPONSE(4,0)<8;4,1> + mov (8) uwDEST_U(1)<1> ubAVS_RESPONSE(4,8)<8;4,1> + mov (8) uwDEST_U(2)<1> ubAVS_RESPONSE(5,0)<8;4,1> + mov (8) uwDEST_U(3)<1> ubAVS_RESPONSE(5,8)<8;4,1> + mov (8) uwDEST_U(4)<1> ubAVS_RESPONSE(10,0)<8;4,1> + mov (8) uwDEST_U(5)<1> ubAVS_RESPONSE(10,8)<8;4,1> + mov (8) uwDEST_U(6)<1> ubAVS_RESPONSE(11,0)<8;4,1> + mov (8) uwDEST_U(7)<1> ubAVS_RESPONSE(11,8)<8;4,1> + + // Move second 8x8 words of Y to dest GRF + mov (8) uwDEST_Y(0,8)<1> uwAVS_RESPONSE_2(2,0)<8;4,1> + mov (8) uwDEST_Y(1,8)<1> uwAVS_RESPONSE_2(2,8)<8;4,1> + mov (8) uwDEST_Y(2,8)<1> uwAVS_RESPONSE_2(3,0)<8;4,1> + mov (8) uwDEST_Y(3,8)<1> uwAVS_RESPONSE_2(3,8)<8;4,1> + mov (8) uwDEST_Y(4,8)<1> uwAVS_RESPONSE_2(8,0)<8;4,1> + mov (8) uwDEST_Y(5,8)<1> uwAVS_RESPONSE_2(8,8)<8;4,1> + mov (8) uwDEST_Y(6,8)<1> uwAVS_RESPONSE_2(9,0)<8;4,1> + mov (8) uwDEST_Y(7,8)<1> uwAVS_RESPONSE_2(9,8)<8;4,1> + + // Move second 8x8 words of V to dest GRF + mov (8) uwDEST_V(0,8)<1> ubAVS_RESPONSE_2(0,0)<8;4,1> + mov (8) uwDEST_V(1,8)<1> ubAVS_RESPONSE_2(0,8)<8;4,1> + mov (8) uwDEST_V(2,8)<1> ubAVS_RESPONSE_2(1,0)<8;4,1> + mov (8) uwDEST_V(3,8)<1> ubAVS_RESPONSE_2(1,8)<8;4,1> + mov (8) uwDEST_V(4,8)<1> ubAVS_RESPONSE_2(6,0)<8;4,1> + mov (8) uwDEST_V(5,8)<1> ubAVS_RESPONSE_2(6,8)<8;4,1> + mov (8) uwDEST_V(6,8)<1> ubAVS_RESPONSE_2(7,0)<8;4,1> + mov (8) uwDEST_V(7,8)<1> ubAVS_RESPONSE_2(7,8)<8;4,1> + + // Move second 8x8 words of U to dest GRF + mov (8) uwDEST_U(0,8)<1> ubAVS_RESPONSE_2(4,0)<8;4,1> + mov (8) uwDEST_U(1,8)<1> ubAVS_RESPONSE_2(4,8)<8;4,1> + mov (8) uwDEST_U(2,8)<1> ubAVS_RESPONSE_2(5,0)<8;4,1> + mov (8) uwDEST_U(3,8)<1> ubAVS_RESPONSE_2(5,8)<8;4,1> + mov (8) uwDEST_U(4,8)<1> ubAVS_RESPONSE_2(10,0)<8;4,1> + mov (8) uwDEST_U(5,8)<1> ubAVS_RESPONSE_2(10,8)<8;4,1> + mov (8) uwDEST_U(6,8)<1> ubAVS_RESPONSE_2(11,0)<8;4,1> + mov (8) uwDEST_U(7,8)<1> ubAVS_RESPONSE_2(11,8)<8;4,1> +*/ +#else /* OUTPUT_8_BIT */ + // Move first 
8x8 words of Y to dest GRF + mov (8) uwDEST_Y(0)<1> ubAVS_RESPONSE(2,1)<16;4,2> + mov (8) uwDEST_Y(1)<1> ubAVS_RESPONSE(2,8+1)<16;4,2> + mov (8) uwDEST_Y(2)<1> ubAVS_RESPONSE(3,1)<16;4,2> + mov (8) uwDEST_Y(3)<1> ubAVS_RESPONSE(3,8+1)<16;4,2> + mov (8) uwDEST_Y(4)<1> ubAVS_RESPONSE(8,1)<16;4,2> + mov (8) uwDEST_Y(5)<1> ubAVS_RESPONSE(8,8+1)<16;4,2> + mov (8) uwDEST_Y(6)<1> ubAVS_RESPONSE(9,1)<16;4,2> + mov (8) uwDEST_Y(7)<1> ubAVS_RESPONSE(9,8+1)<16;4,2> + + // Move first 8x8 words of V to dest GRF + mov (8) uwDEST_V(0)<1> ubAVS_RESPONSE(0,1)<16;4,2> + mov (8) uwDEST_V(1)<1> ubAVS_RESPONSE(0,8+1)<16;4,2> + mov (8) uwDEST_V(2)<1> ubAVS_RESPONSE(1,1)<16;4,2> + mov (8) uwDEST_V(3)<1> ubAVS_RESPONSE(1,8+1)<16;4,2> + mov (8) uwDEST_V(4)<1> ubAVS_RESPONSE(6,1)<16;4,2> + mov (8) uwDEST_V(5)<1> ubAVS_RESPONSE(6,8+1)<16;4,2> + mov (8) uwDEST_V(6)<1> ubAVS_RESPONSE(7,1)<16;4,2> + mov (8) uwDEST_V(7)<1> ubAVS_RESPONSE(7,8+1)<16;4,2> + + // Move first 8x8 words of U to dest GRF + mov (8) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;4,2> + mov (8) uwDEST_U(1)<1> ubAVS_RESPONSE(4,8+1)<16;4,2> + mov (8) uwDEST_U(2)<1> ubAVS_RESPONSE(5,1)<16;4,2> + mov (8) uwDEST_U(3)<1> ubAVS_RESPONSE(5,8+1)<16;4,2> + mov (8) uwDEST_U(4)<1> ubAVS_RESPONSE(10,1)<16;4,2> + mov (8) uwDEST_U(5)<1> ubAVS_RESPONSE(10,8+1)<16;4,2> + mov (8) uwDEST_U(6)<1> ubAVS_RESPONSE(11,1)<16;4,2> + mov (8) uwDEST_U(7)<1> ubAVS_RESPONSE(11,8+1)<16;4,2> + + // Move second 8x8 words of Y to dest GRF + mov (8) uwDEST_Y(0,8)<1> ubAVS_RESPONSE_2(2,1)<16;4,2> + mov (8) uwDEST_Y(1,8)<1> ubAVS_RESPONSE_2(2,8+1)<16;4,2> + mov (8) uwDEST_Y(2,8)<1> ubAVS_RESPONSE_2(3,1)<16;4,2> + mov (8) uwDEST_Y(3,8)<1> ubAVS_RESPONSE_2(3,8+1)<16;4,2> + mov (8) uwDEST_Y(4,8)<1> ubAVS_RESPONSE_2(8,1)<16;4,2> + mov (8) uwDEST_Y(5,8)<1> ubAVS_RESPONSE_2(8,8+1)<16;4,2> + mov (8) uwDEST_Y(6,8)<1> ubAVS_RESPONSE_2(9,1)<16;4,2> + mov (8) uwDEST_Y(7,8)<1> ubAVS_RESPONSE_2(9,8+1)<16;4,2> + + // Move second 8x8 words of V to dest GRF + mov (8) uwDEST_V(0,8)<1> 
ubAVS_RESPONSE_2(0,1)<16;4,2> + mov (8) uwDEST_V(1,8)<1> ubAVS_RESPONSE_2(0,8+1)<16;4,2> + mov (8) uwDEST_V(2,8)<1> ubAVS_RESPONSE_2(1,1)<16;4,2> + mov (8) uwDEST_V(3,8)<1> ubAVS_RESPONSE_2(1,8+1)<16;4,2> + mov (8) uwDEST_V(4,8)<1> ubAVS_RESPONSE_2(6,1)<16;4,2> + mov (8) uwDEST_V(5,8)<1> ubAVS_RESPONSE_2(6,8+1)<16;4,2> + mov (8) uwDEST_V(6,8)<1> ubAVS_RESPONSE_2(7,1)<16;4,2> + mov (8) uwDEST_V(7,8)<1> ubAVS_RESPONSE_2(7,8+1)<16;4,2> + + // Move second 8x8 words of U to dest GRF + mov (8) uwDEST_U(0,8)<1> ubAVS_RESPONSE_2(4,1)<16;4,2> + mov (8) uwDEST_U(1,8)<1> ubAVS_RESPONSE_2(4,8+1)<16;4,2> + mov (8) uwDEST_U(2,8)<1> ubAVS_RESPONSE_2(5,1)<16;4,2> + mov (8) uwDEST_U(3,8)<1> ubAVS_RESPONSE_2(5,8+1)<16;4,2> + mov (8) uwDEST_U(4,8)<1> ubAVS_RESPONSE_2(10,1)<16;4,2> + mov (8) uwDEST_U(5,8)<1> ubAVS_RESPONSE_2(10,8+1)<16;4,2> + mov (8) uwDEST_U(6,8)<1> ubAVS_RESPONSE_2(11,1)<16;4,2> + mov (8) uwDEST_U(7,8)<1> ubAVS_RESPONSE_2(11,8+1)<16;4,2> +#endif +//------------------------------------------------------------------------------ + + // Re-define new number of lines + #undef nUV_NUM_OF_ROWS + #undef nY_NUM_OF_ROWS + + #define nY_NUM_OF_ROWS 8 + #define nUV_NUM_OF_ROWS 8 + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x4.asm new file mode 100644 index 0000000..01d451d --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x4.asm @@ -0,0 +1,77 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- PA_AVS_IEF_Unpack_8x4.asm ---------- + +// Yoni: In order to optimize unpacking, 3 methods are being checked: +// 1. AVS_ORIGINAL +// 2. AVS_ROUND_TO_8_BITS +// 3. 
AVS_INDIRECT_ACCESS +// +// Only 1 method should stay in the code + + +//#define AVS_ROUND_TO_8_BITS +//#define AVS_INDIRECT_ACCESS + + + // Move first 8x8 words of Y to dest GRF + mov (8) uwDEST_Y(0)<1> ubAVS_RESPONSE(2,1)<16;4,2> + mov (8) uwDEST_Y(1)<1> ubAVS_RESPONSE(2,8+1)<16;4,2> + mov (8) uwDEST_Y(2)<1> ubAVS_RESPONSE(3,1)<16;4,2> + mov (8) uwDEST_Y(3)<1> ubAVS_RESPONSE(3,8+1)<16;4,2> + mov (8) uwDEST_Y(4)<1> ubAVS_RESPONSE(8,1)<16;4,2> + mov (8) uwDEST_Y(5)<1> ubAVS_RESPONSE(8,8+1)<16;4,2> + mov (8) uwDEST_Y(6)<1> ubAVS_RESPONSE(9,1)<16;4,2> + mov (8) uwDEST_Y(7)<1> ubAVS_RESPONSE(9,8+1)<16;4,2> + + // Move first 4x8 words of V to dest GRF + mov (4) uwDEST_V(0)<1> ubAVS_RESPONSE(0,1)<16;2,4> + mov (4) uwDEST_V(0,8)<1> ubAVS_RESPONSE(1,1)<16;2,4> + mov (4) uwDEST_V(1)<1> ubAVS_RESPONSE(6,1)<16;2,4> + mov (4) uwDEST_V(1,8)<1> ubAVS_RESPONSE(7,1)<16;2,4> + + // Move first 4x8 words of U to dest GRF + mov (4) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;2,4> + mov (4) uwDEST_U(0,8)<1> ubAVS_RESPONSE(5,1)<16;2,4> + mov (4) uwDEST_U(1)<1> ubAVS_RESPONSE(10,1)<16;2,4> + mov (4) uwDEST_U(1,8)<1> ubAVS_RESPONSE(11,1)<16;2,4> + + // Move second 8x8 words of Y to dest GRF + mov (8) uwDEST_Y(0,8)<1> ubAVS_RESPONSE_2(2,1)<16;4,2> + mov (8) uwDEST_Y(1,8)<1> ubAVS_RESPONSE_2(2,8+1)<16;4,2> + mov (8) uwDEST_Y(2,8)<1> ubAVS_RESPONSE_2(3,1)<16;4,2> + mov (8) uwDEST_Y(3,8)<1> ubAVS_RESPONSE_2(3,8+1)<16;4,2> + mov (8) uwDEST_Y(4,8)<1> ubAVS_RESPONSE_2(8,1)<16;4,2> + mov (8) uwDEST_Y(5,8)<1> ubAVS_RESPONSE_2(8,8+1)<16;4,2> + mov (8) uwDEST_Y(6,8)<1> ubAVS_RESPONSE_2(9,1)<16;4,2> + mov (8) uwDEST_Y(7,8)<1> ubAVS_RESPONSE_2(9,8+1)<16;4,2> + + // Move second 4x8 words of V to dest GRF + mov (4) uwDEST_V(0,4)<1> ubAVS_RESPONSE_2(0,1)<16;2,4> + mov (4) uwDEST_V(0,12)<1> ubAVS_RESPONSE_2(1,1)<16;2,4> + mov (4) uwDEST_V(1,4)<1> ubAVS_RESPONSE_2(6,1)<16;2,4> + mov (4) uwDEST_V(1,12)<1> ubAVS_RESPONSE_2(7,1)<16;2,4> + + // Move second 4x8 words of U to dest GRF + mov (4) uwDEST_U(0,4)<1> 
ubAVS_RESPONSE_2(4,1)<16;2,4> + mov (4) uwDEST_U(0,12)<1> ubAVS_RESPONSE_2(5,1)<16;2,4> + mov (4) uwDEST_U(1,4)<1> ubAVS_RESPONSE_2(10,1)<16;2,4> + mov (4) uwDEST_U(1,12)<1> ubAVS_RESPONSE_2(11,1)<16;2,4> + +//------------------------------------------------------------------------------ + + // Re-define new number of lines + #undef nUV_NUM_OF_ROWS + #undef nY_NUM_OF_ROWS + + #define nY_NUM_OF_ROWS 8 + #define nUV_NUM_OF_ROWS 8 + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x8.asm new file mode 100644 index 0000000..91b2398 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_AVS_IEF_Unpack_8x8.asm @@ -0,0 +1,93 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- PA_AVS_IEF_Unpack_8x8.asm ---------- + +// Yoni: In order to optimize unpacking, 3 methods are being checked: +// 1. AVS_ORIGINAL +// 2. AVS_ROUND_TO_8_BITS +// 3. 
AVS_INDIRECT_ACCESS +// +// Only 1 method should stay in the code + + +//#define AVS_ROUND_TO_8_BITS +//#define AVS_INDIRECT_ACCESS + + + // Move first 8x8 words of Y to dest GRF + mov (8) uwDEST_Y(0)<1> ubAVS_RESPONSE(2,1)<16;4,2> + mov (8) uwDEST_Y(1)<1> ubAVS_RESPONSE(2,8+1)<16;4,2> + mov (8) uwDEST_Y(2)<1> ubAVS_RESPONSE(3,1)<16;4,2> + mov (8) uwDEST_Y(3)<1> ubAVS_RESPONSE(3,8+1)<16;4,2> + mov (8) uwDEST_Y(4)<1> ubAVS_RESPONSE(8,1)<16;4,2> + mov (8) uwDEST_Y(5)<1> ubAVS_RESPONSE(8,8+1)<16;4,2> + mov (8) uwDEST_Y(6)<1> ubAVS_RESPONSE(9,1)<16;4,2> + mov (8) uwDEST_Y(7)<1> ubAVS_RESPONSE(9,8+1)<16;4,2> + + // Move first 4x8 words of V to dest GRF + mov (4) uwDEST_V(0)<1> ubAVS_RESPONSE(0,1)<16;2,4> + mov (4) uwDEST_V(0,8)<1> ubAVS_RESPONSE(0,8+1)<16;2,4> + mov (4) uwDEST_V(1)<1> ubAVS_RESPONSE(1,1)<16;2,4> + mov (4) uwDEST_V(1,8)<1> ubAVS_RESPONSE(1,8+1)<16;2,4> + mov (4) uwDEST_V(2)<1> ubAVS_RESPONSE(6,1)<16;2,4> + mov (4) uwDEST_V(2,8)<1> ubAVS_RESPONSE(6,8+1)<16;2,4> + mov (4) uwDEST_V(3)<1> ubAVS_RESPONSE(7,1)<16;2,4> + mov (4) uwDEST_V(3,8)<1> ubAVS_RESPONSE(7,8+1)<16;2,4> + + // Move first 4x8 words of U to dest GRF + mov (4) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;2,4> + mov (4) uwDEST_U(0,8)<1> ubAVS_RESPONSE(4,8+1)<16;2,4> + mov (4) uwDEST_U(1)<1> ubAVS_RESPONSE(5,1)<16;2,4> + mov (4) uwDEST_U(1,8)<1> ubAVS_RESPONSE(5,8+1)<16;2,4> + mov (4) uwDEST_U(2)<1> ubAVS_RESPONSE(10,1)<16;2,4> + mov (4) uwDEST_U(2,8)<1> ubAVS_RESPONSE(10,8+1)<16;2,4> + mov (4) uwDEST_U(3)<1> ubAVS_RESPONSE(11,1)<16;2,4> + mov (4) uwDEST_U(3,8)<1> ubAVS_RESPONSE(11,8+1)<16;2,4> + + // Move second 8x8 words of Y to dest GRF + mov (8) uwDEST_Y(0,8)<1> ubAVS_RESPONSE_2(2,1)<16;4,2> + mov (8) uwDEST_Y(1,8)<1> ubAVS_RESPONSE_2(2,8+1)<16;4,2> + mov (8) uwDEST_Y(2,8)<1> ubAVS_RESPONSE_2(3,1)<16;4,2> + mov (8) uwDEST_Y(3,8)<1> ubAVS_RESPONSE_2(3,8+1)<16;4,2> + mov (8) uwDEST_Y(4,8)<1> ubAVS_RESPONSE_2(8,1)<16;4,2> + mov (8) uwDEST_Y(5,8)<1> ubAVS_RESPONSE_2(8,8+1)<16;4,2> + mov (8) 
uwDEST_Y(6,8)<1> ubAVS_RESPONSE_2(9,1)<16;4,2> + mov (8) uwDEST_Y(7,8)<1> ubAVS_RESPONSE_2(9,8+1)<16;4,2> + + // Move second 4x8 words of V to dest GRF + mov (4) uwDEST_V(0,4)<1> ubAVS_RESPONSE_2(0,1)<16;2,4> + mov (4) uwDEST_V(0,12)<1> ubAVS_RESPONSE_2(0,8+1)<16;2,4> + mov (4) uwDEST_V(1,4)<1> ubAVS_RESPONSE_2(1,1)<16;2,4> + mov (4) uwDEST_V(1,12)<1> ubAVS_RESPONSE_2(1,8+1)<16;2,4> + mov (4) uwDEST_V(2,4)<1> ubAVS_RESPONSE_2(6,1)<16;2,4> + mov (4) uwDEST_V(2,12)<1> ubAVS_RESPONSE_2(6,8+1)<16;2,4> + mov (4) uwDEST_V(3,4)<1> ubAVS_RESPONSE_2(7,1)<16;2,4> + mov (4) uwDEST_V(3,12)<1> ubAVS_RESPONSE_2(7,8+1)<16;2,4> + + // Move second 4x8 words of U to dest GRF + mov (4) uwDEST_U(0,4)<1> ubAVS_RESPONSE_2(4,1)<16;2,4> + mov (4) uwDEST_U(0,12)<1> ubAVS_RESPONSE_2(4,8+1)<16;2,4> + mov (4) uwDEST_U(1,4)<1> ubAVS_RESPONSE_2(5,1)<16;2,4> + mov (4) uwDEST_U(1,12)<1> ubAVS_RESPONSE_2(5,8+1)<16;2,4> + mov (4) uwDEST_U(2,4)<1> ubAVS_RESPONSE_2(10,1)<16;2,4> + mov (4) uwDEST_U(2,12)<1> ubAVS_RESPONSE_2(10,8+1)<16;2,4> + mov (4) uwDEST_U(3,4)<1> ubAVS_RESPONSE_2(11,1)<16;2,4> + mov (4) uwDEST_U(3,12)<1> ubAVS_RESPONSE_2(11,8+1)<16;2,4> + +//------------------------------------------------------------------------------ + + // Re-define new number of lines + #undef nUV_NUM_OF_ROWS + #undef nY_NUM_OF_ROWS + + #define nY_NUM_OF_ROWS 8 + #define nUV_NUM_OF_ROWS 8 + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DNDI_ALG.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DNDI_ALG.asm new file mode 100644 index 0000000..6aa91c8 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DNDI_ALG.asm @@ -0,0 +1,139 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +#define DI_ENABLE + + #include "DNDI.inc" + + #ifdef DI_ONLY + #undef nSMPL_RESP_LEN + #define nSMPL_RESP_LEN nSMPL_RESP_LEN_DI // set the number of GRF + #else + #undef nSMPL_RESP_LEN + #define nSMPL_RESP_LEN nSMPL_RESP_LEN_DNDI // set the number of GRF + #endif + + #undef nDPW_BLOCK_SIZE_HIST + #define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1 // HIST Block Size for Write is 4x2 + #undef nDPW_BLOCK_SIZE_DN + #define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_32+nBLOCK_HEIGHT_4 // DN Block Size for Write is 32x4 + +////////////////////////////////////// Run the DN Algorithm /////////////////////////////////////// + #include "DNDI_Command.asm" + +////////////////////////////////////// Rearrange for Internal Planar ////////////////////////////// + //// move the previous frame Y component to internal planar format + //$for (0; <nY_NUM_OF_ROWS/2; 1) { + // mov (16) uwDEST_Y(%1,0)<1> ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16) + //} + //// move the previous frame U,V components to internal planar format + //$for (0; <nUV_NUM_OF_ROWS/2; 1) { + // mov (8) uwDEST_U(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels + // mov (8) uwDEST_V(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels + //} + //// move the current frame Y component to internal planar format + //$for (0; <nY_NUM_OF_ROWS/2; 1) { + // mov (16) uwDEST_Y(%1+4,0)<1> ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16) + //} + //// move the current frame U,V components to internal planar format + //$for (0; <nUV_NUM_OF_ROWS/2; 1) { + // mov (8) uwDEST_U(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels + // mov (8) uwDEST_V(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels + //} + +////////////////////////////////////// Save the STMM Data for Next Run ///////////////////////// + // Write STMM to memory + shr (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w NODDCLR // X origin / 2 + mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w 
NODDCLR_NODDCHK // Y origin + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_STMM:ud NODDCHK // block width and height (8x4) + mov (8) mudMSGHDR_STMM(0)<1> rMSGSRC.0<8;8,1>:ud // message header + mov (8) mudMSGHDR_STMM(1)<1> udRESP(nDI_STMM_OFFSET,0) // Move STMM to MRF + send (8) dNULLREG mMSGHDR_STMM udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud + +////////////////////////////////////// Save the History Data for Next Run ///////////////////////// +#ifdef DI_ONLY +#else + + #include "DI_Hist_Save.asm" + +////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run /////////////// + // check top/bottom field first + cmp.e.f0.0 (1) null<1>:w ubTFLD_FIRST<0;1,0>:ub 1:w + + add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub npDN_YUV:uw + //set the save DN position + shl (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w NODDCLR // X origin * 2 + mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w NODDCLR_NODDCHK // Y origin + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud NODDCHK // block width and height (8x4) + mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud + + + (f0.0) jmpi (1) TOP_FIELD_FIRST + +BOTTOM_FIELD_FIRST: + //$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) { + // mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2) + // mov (16) r[pCF_Y_OFFSET, %1+1*32]<2>:ub ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3) + // mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2) + // mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2) + // mov (8) r[pCF_U_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3) + // mov (8) r[pCF_V_OFFSET, %1+1*32]<4>:ub 
ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field V from current frame (line 1,3) + //} + $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) { + mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 0,2) + mov (16) r[pCF_Y_OFFSET, %1+1*32]<2>:ub ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,16) // 1st field luma from current frame (line 1,3) + } + + $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) { + mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 0,2) + mov (8) r[pCF_U_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16+1)<16;8,2> // 1st field U from current frame (line 1,3) + } + + $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) { + mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 0,2) + mov (8) r[pCF_V_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,16)<16;8,2> // 1st field V from current frame (line 1,3) + } + + jmpi (1) SAVE_DN_CURR + +TOP_FIELD_FIRST: + //$for (0,0; <nY_NUM_OF_ROWS/2; 2,1) { + // mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2) + // mov (16) r[pCF_Y_OFFSET, %1+1*32]<2>:ub ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3) + // mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2) + // mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2) + // mov (8) r[pCF_U_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3) + // mov (8) r[pCF_V_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3) + //} + $for (0,0; 
<nY_NUM_OF_ROWS/2; 2,1) { + mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0) // 1st field luma from current frame (line 0,2) + mov (16) r[pCF_Y_OFFSET, %1+1*32]<2>:ub ubRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*16) // 2nd field luma from current frame (line 1,3) + } + $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) { + mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,1)<16;8,2> // 1st field U from current frame (line 0,2) + mov (8) r[pCF_U_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16+1)<16;8,2> // 2nd field U from current frame (line 1,3) + } + $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) { + mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET+%2,0)<16;8,2> // 1st field V from current frame (line 0,2) + mov (8) r[pCF_V_OFFSET, %1+1*32]<4>:ub ubRESP(nDI_CURR_2ND_FIELD_CHROMA_OFFSET,%2*16)<16;8,2> // 2nd field V from current frame (line 1,3) + } + +SAVE_DN_CURR: + $for(0; <nY_NUM_OF_ROWS/2; 1) { + mov (8) mudMSGHDR_DN(%1+1)<1> udDN_YUV(%1)REGION(8,1) + } + send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_DI+nBI_DESTINATION_YUV:ud +#endif + +// Save Processed frames +#include "DI_Save_PA.asm" + + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DN_ALG.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DN_ALG.asm new file mode 100644 index 0000000..ef88a3c --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_DN_ALG.asm @@ -0,0 +1,54 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +#define DI_DISABLE + +#include "DNDI.inc" + +#undef nY_NUM_OF_ROWS +#define nY_NUM_OF_ROWS 8 // Number of Y rows per block +#undef nUV_NUM_OF_ROWS +#define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block + +#undef nSMPL_RESP_LEN +#define nSMPL_RESP_LEN nSMPL_RESP_LEN_DN_PA // Set the Number of GRFs in DNDI response +#undef nDPW_BLOCK_SIZE_DN +#define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // DN Curr Block Size for Write is 32x8 +#undef nDPW_BLOCK_SIZE_HIST +#define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_2 // HIST Block Size for Write is 4x2 + +////////////////////////////////////// Run the DN Algorithm /////////////////////////////////////// +#include "DNDI_COMMAND.asm" + +////////////////////////////////////// Save the History Data for Next Run ///////////////////////// +#include "DNDI_Hist_Save.asm" + +////////////////////////////////////// Pack and Save the DN Curr Frame for Next Run /////////////// +add (4) pCF_Y_OFFSET<1>:uw ubDEST_CF_OFFSET<4;4,1>:ub npDN_YUV:w +$for (0; <nY_NUM_OF_ROWS; 1) { + mov (16) r[pCF_Y_OFFSET, %1*32]<2>:ub ubRESP(nNODI_LUMA_OFFSET,%1*16)<16;16,1> // copy line of Y +} +$for (0; <nUV_NUM_OF_ROWS; 1) { + mov (8) r[pCF_U_OFFSET, %1*32]<4>:ub ubRESP(nNODI_CHROMA_OFFSET,%1*16+1)<16;8,2> // copy line of U + mov (8) r[pCF_V_OFFSET, %1*32]<4>:ub ubRESP(nNODI_CHROMA_OFFSET,%1*16)<16;8,2> // copy line of V +} + +shl (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w // X origin * 2 (422 output) +mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w // Y origin +mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud // block width and height (32x8) +mov (8) mMSGHDR_DN<1>:ud rMSGSRC<8;8,1>:ud // message header + +$for(0; <nY_NUM_OF_ROWS; 2) { + mov (16) mudMSGHDR_DN(1+%1)<1> udDN_YUV(%1)REGION(8,1) // Move DN Curr to MRF +} +send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PA_DN_NODI+nBI_DESTINATION_YUV:ud + + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PA_Scaling.asm 
b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_Scaling.asm new file mode 100644 index 0000000..c2a1b1e --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PA_Scaling.asm @@ -0,0 +1,70 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- PA_Scaling.asm ---------- +#include "Scaling.inc" + + // Build 16 elements ramp in float32 and normalized it +// mov (8) SAMPLER_RAMP(0)<1> 0x76543210:v +// add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f +mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf //3, 2, 1, 0 in float vector +mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf //7, 6, 5, 4 in float vector +add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f + +//Module: PrepareScaleCoord.asm + + // Setup for sampler msg hdr + mov (2) rMSGSRC.0<1>:ud 0:ud { NoDDClr } // Unused fields + mov (1) rMSGSRC.2<1>:ud 0:ud { NoDDChk } // Write and offset + + // Calculate 16 v based on the step Y and vertical origin + mov (16) mfMSGPAYLOAD(2)<1> fSRC_VID_V_ORI<0;1,0>:f + mov (16) SCALE_COORD_Y<1>:f fSRC_VID_V_ORI<0;1,0>:f + + // Calculate 16 u based on the step X and hori origin +// line (16) mfMSGPAYLOAD(0)<1> SCALE_STEP_X<0;1,0>:f SAMPLER_RAMP(0) // Assign to mrf directly + mov (16) acc0:f fSRC_VID_H_ORI<0;1,0>:f { Compr } + mac (16) mfMSGPAYLOAD(0)<1> fVIDEO_STEP_X<0;1,0>:f SAMPLER_RAMP(0) { Compr } + + //Setup the constants for line instruction + mov (1) SCALE_LINE_P255<1>:f 255.0:f { NoDDClr } //{ NoDDClr, NoDDChk } + mov (1) SCALE_LINE_P0_5<1>:f 0.5:f { NoDDChk } + +//------------------------------------------------------------------------------ + +$for (0; <nY_NUM_OF_ROWS; 1) { + + // Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA). 
+ mov (8) MSGHDR_SCALE.0:ud rMSGSRC.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (16) SCALE_RESPONSE_YW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_YUV+nBI_CURRENT_SRC_YUV + + // Calculate 16 v for next line + add (16) mfMSGPAYLOAD(2)<1> SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Assign to mrf directly + add (16) SCALE_COORD_Y<1>:f SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Advance the saved Y coordinate by the same step + + // Scale back to [0, 255], convert f to ud + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(0) { Compr } // Process B, V + mov (16) SCALE_RESPONSE_YD(0)<1> acc0:f { Compr } + + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(2) { Compr } // Process G, Y + mov (16) SCALE_RESPONSE_YD(2)<1> acc0:f { Compr } + + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(4) { Compr } // Process R, U + mov (16) SCALE_RESPONSE_YD(4)<1> acc0:f { Compr } + + mov (16) DEST_V(%1)<1> SCALE_RESPONSE_YB(0) //possible error due to truncation - vK + mov (16) DEST_Y(%1)<1> SCALE_RESPONSE_YB(2) //possible error due to truncation - vK + mov (16) DEST_U(%1)<1> SCALE_RESPONSE_YB(4) //possible error due to truncation - vK + +} + + #define nSRC_REGION nREGION_1 + +//------------------------------------------------------------------------------ diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_16x8.asm new file mode 100644 index 0000000..2f7b735 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_16x8.asm @@ -0,0 +1,60 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//---------- PL2_AVS_IEF_16x8.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 Y each +// 2 sampler read for 8x8 U and 8x8 V (NV11\P208 input surface) +//------------------------------------------------------------------------------ + + // 1st 8x8 setup + #include "AVS_SetupFirstBlock.asm" + + // Enable green channel only + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud + + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + // Return Y in 4 GRFs + + // 8x8 U and V sampling + // Enable red and blue channels + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_BLUE_CHANNELS:ud + + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV + // Return U and V in 8 GRFs + + // 2nd 8x8 setup + #include "AVS_SetupSecondBlock.asm" + + // 2nd 8x8 Y sampling + // Enable green channel only + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud + + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + + // 2nd 8x8 U and V sampling + // Enable red and blue channels + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_BLUE_CHANNELS:ud + + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE_2(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV + // Return U and V in 8 GRFs + +//------------------------------------------------------------------------------ +// Unpacking sampler reads to 
4:4:4 internal planar +//------------------------------------------------------------------------------ + #include "PL2_AVS_IEF_Unpack_16x8.asm" + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x4.asm new file mode 100644 index 0000000..9b221e7 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x4.asm @@ -0,0 +1,58 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- PL2_AVS_IEF_8x4.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 Y each +// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface) +//------------------------------------------------------------------------------ + + // 1st 8x8 setup + #include "AVS_SetupFirstBlock.asm" + + // Enable green channel only + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud + + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + // Return Y in 4 GRFs + + // 8x8 U and V sampling + // Enable red and blue channels + //Only 8x4 will be used + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_BLUE_CHANNELS:ud + + // Calculate Chroma Step Size: + // for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X + // for V direction: 8 Luma samples are covered by 8 Chroma samples. 
Thus Chroma_Step_Y = Luma_Step_Y + mul (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f 2.0:f // Step X for chroma + + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV + // Return U and V in 8 GRFs + + // 2nd 8x8 setup + #include "AVS_SetupSecondBlock.asm" + + // 2nd 8x8 Y sampling + // Enable green channel only + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud + + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + +//------------------------------------------------------------------------------ +// Unpacking sampler reads to 4:2:0 internal planar +//------------------------------------------------------------------------------ + #include "PL2_AVS_IEF_Unpack_8x4.asm" + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x8.asm new file mode 100644 index 0000000..404fbd0 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_8x8.asm @@ -0,0 +1,57 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//---------- PL2_AVS_IEF_8x8.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 Y each +// 1 sampler read for 8x8 U and 8x8 V (NV11\NV12 input surface) +//------------------------------------------------------------------------------ + + // 1st 8x8 setup + #include "AVS_SetupFirstBlock.asm" + + // Enable green channel only + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud + + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + // Return Y in 4 GRFs + + // 8x8 U and V sampling + // Enable red and blue channels + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_BLUE_CHANNELS:ud + + // Calculate Chroma Step Size: + // for H direction: 16 Luma samples are covered by 8 Chroma samples. Thus Chroma_Step_X = 2 * Luma_Step_X + // for V direction: 8 Luma samples are covered by 8 Chroma samples. 
Thus Chroma_Step_Y = Luma_Step_Y + mul (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f 2.0:f // Step X for chroma + + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_2CH+nSI_SRC_UV+nBI_CURRENT_SRC_UV + // Return U and V in 8 GRFs + + // 2nd 8x8 setup + #include "AVS_SetupSecondBlock.asm" + + // 2nd 8x8 Y sampling + // Enable green channel only + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud + + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + +//------------------------------------------------------------------------------ +// Unpacking sampler reads to 4:2:2 internal planar +//------------------------------------------------------------------------------ + #include "PL2_AVS_IEF_Unpack_8x8.asm" + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_16x8.asm new file mode 100644 index 0000000..6c994c1 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_16x8.asm @@ -0,0 +1,271 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//---------- PL2_AVS_IEF_Unpack_16x8.asm ---------- + +#ifdef AVS_OUTPUT_16_BIT //Output is packed in AVYU format +// Move first 8x8 words of Y to dest GRF (as packed) + mov (4) uwDEST_Y(0,1)<4> uwAVS_RESPONSE(0,0)<4;4,1> + mov (4) uwDEST_Y(1,1)<4> uwAVS_RESPONSE(0,8)<4;4,1> + mov (4) uwDEST_Y(4,1)<4> uwAVS_RESPONSE(0,4)<4;4,1> + mov (4) uwDEST_Y(5,1)<4> uwAVS_RESPONSE(0,12)<4;4,1> + mov (4) uwDEST_Y(8,1)<4> uwAVS_RESPONSE(1,0)<4;4,1> + mov (4) uwDEST_Y(9,1)<4> uwAVS_RESPONSE(1,8)<4;4,1> + mov (4) uwDEST_Y(12,1)<4> uwAVS_RESPONSE(1,4)<4;4,1> + mov (4) uwDEST_Y(13,1)<4> uwAVS_RESPONSE(1,12)<4;4,1> + mov (4) uwDEST_Y(16,1)<4> uwAVS_RESPONSE(2,0)<4;4,1> + mov (4) uwDEST_Y(17,1)<4> uwAVS_RESPONSE(2,8)<4;4,1> + mov (4) uwDEST_Y(20,1)<4> uwAVS_RESPONSE(2,4)<4;4,1> + mov (4) uwDEST_Y(21,1)<4> uwAVS_RESPONSE(2,12)<4;4,1> + mov (4) uwDEST_Y(24,1)<4> uwAVS_RESPONSE(3,0)<4;4,1> + mov (4) uwDEST_Y(25,1)<4> uwAVS_RESPONSE(3,8)<4;4,1> + mov (4) uwDEST_Y(28,1)<4> uwAVS_RESPONSE(3,4)<4;4,1> + mov (4) uwDEST_Y(29,1)<4> uwAVS_RESPONSE(3,12)<4;4,1> + +// Move first 8x8 words of U to dest GRF (as packed) + mov (4) uwDEST_Y(0,0)<4> uwAVS_RESPONSE(4,0)<4;4,1> + mov (4) uwDEST_Y(1,0)<4> uwAVS_RESPONSE(4,8)<4;4,1> + mov (4) uwDEST_Y(4,0)<4> uwAVS_RESPONSE(4,4)<4;4,1> + mov (4) uwDEST_Y(5,0)<4> uwAVS_RESPONSE(4,12)<4;4,1> + mov (4) uwDEST_Y(8,0)<4> uwAVS_RESPONSE(5,0)<4;4,1> + mov (4) uwDEST_Y(9,0)<4> uwAVS_RESPONSE(5,8)<4;4,1> + mov (4) uwDEST_Y(12,0)<4> uwAVS_RESPONSE(5,4)<4;4,1> + mov (4) uwDEST_Y(13,0)<4> uwAVS_RESPONSE(5,12)<4;4,1> + mov (4) uwDEST_Y(16,0)<4> uwAVS_RESPONSE(8,0)<4;4,1> + mov (4) uwDEST_Y(17,0)<4> uwAVS_RESPONSE(8,8)<4;4,1> + mov (4) uwDEST_Y(20,0)<4> uwAVS_RESPONSE(8,4)<4;4,1> + mov (4) uwDEST_Y(21,0)<4> uwAVS_RESPONSE(8,12)<4;4,1> + mov (4) uwDEST_Y(24,0)<4> uwAVS_RESPONSE(9,0)<4;4,1> + mov (4) uwDEST_Y(25,0)<4> uwAVS_RESPONSE(9,8)<4;4,1> + mov (4) uwDEST_Y(28,0)<4> uwAVS_RESPONSE(9,4)<4;4,1> + mov (4) uwDEST_Y(29,0)<4> uwAVS_RESPONSE(9,12)<4;4,1> + +// 
Move first 8x8 words of V to dest GRF (as packed) + mov (4) uwDEST_Y(0,2)<4> uwAVS_RESPONSE(6,0)<4;4,1> + mov (4) uwDEST_Y(1,2)<4> uwAVS_RESPONSE(6,8)<4;4,1> + mov (4) uwDEST_Y(4,2)<4> uwAVS_RESPONSE(6,4)<4;4,1> + mov (4) uwDEST_Y(5,2)<4> uwAVS_RESPONSE(6,12)<4;4,1> + mov (4) uwDEST_Y(8,2)<4> uwAVS_RESPONSE(7,0)<4;4,1> + mov (4) uwDEST_Y(9,2)<4> uwAVS_RESPONSE(7,8)<4;4,1> + mov (4) uwDEST_Y(12,2)<4> uwAVS_RESPONSE(7,4)<4;4,1> + mov (4) uwDEST_Y(13,2)<4> uwAVS_RESPONSE(7,12)<4;4,1> + mov (4) uwDEST_Y(16,2)<4> uwAVS_RESPONSE(10,0)<4;4,1> + mov (4) uwDEST_Y(17,2)<4> uwAVS_RESPONSE(10,8)<4;4,1> + mov (4) uwDEST_Y(20,2)<4> uwAVS_RESPONSE(10,4)<4;4,1> + mov (4) uwDEST_Y(21,2)<4> uwAVS_RESPONSE(10,12)<4;4,1> + mov (4) uwDEST_Y(24,2)<4> uwAVS_RESPONSE(11,0)<4;4,1> + mov (4) uwDEST_Y(25,2)<4> uwAVS_RESPONSE(11,8)<4;4,1> + mov (4) uwDEST_Y(28,2)<4> uwAVS_RESPONSE(11,4)<4;4,1> + mov (4) uwDEST_Y(29,2)<4> uwAVS_RESPONSE(11,12)<4;4,1> + +// Move first 8x8 words of A to dest GRF (as packed) + mov (4) uwDEST_Y(0,3)<4> 0:uw + mov (4) uwDEST_Y(1,3)<4> 0:uw + mov (4) uwDEST_Y(4,3)<4> 0:uw + mov (4) uwDEST_Y(5,3)<4> 0:uw + mov (4) uwDEST_Y(8,3)<4> 0:uw + mov (4) uwDEST_Y(9,3)<4> 0:uw + mov (4) uwDEST_Y(12,3)<4> 0:uw + mov (4) uwDEST_Y(13,3)<4> 0:uw + mov (4) uwDEST_Y(16,3)<4> 0:uw + mov (4) uwDEST_Y(17,3)<4> 0:uw + mov (4) uwDEST_Y(20,3)<4> 0:uw + mov (4) uwDEST_Y(21,3)<4> 0:uw + mov (4) uwDEST_Y(24,3)<4> 0:uw + mov (4) uwDEST_Y(25,3)<4> 0:uw + mov (4) uwDEST_Y(28,3)<4> 0:uw + mov (4) uwDEST_Y(29,3)<4> 0:uw + +// Move second 8x8 words of Y to dest GRF + mov (4) uwDEST_Y(2,1)<4> uwAVS_RESPONSE_2(0,0)<4;4,1> + mov (4) uwDEST_Y(3,1)<4> uwAVS_RESPONSE_2(0,8)<4;4,1> + mov (4) uwDEST_Y(6,1)<4> uwAVS_RESPONSE_2(0,4)<4;4,1> + mov (4) uwDEST_Y(7,1)<4> uwAVS_RESPONSE_2(0,12)<4;4,1> + mov (4) uwDEST_Y(10,1)<4> uwAVS_RESPONSE_2(1,0)<4;4,1> + mov (4) uwDEST_Y(11,1)<4> uwAVS_RESPONSE_2(1,8)<4;4,1> + mov (4) uwDEST_Y(14,1)<4> uwAVS_RESPONSE_2(1,4)<4;4,1> + mov (4) uwDEST_Y(15,1)<4> 
uwAVS_RESPONSE_2(1,12)<4;4,1> + mov (4) uwDEST_Y(18,1)<4> uwAVS_RESPONSE_2(2,0)<4;4,1> + mov (4) uwDEST_Y(19,1)<4> uwAVS_RESPONSE_2(2,8)<4;4,1> + mov (4) uwDEST_Y(22,1)<4> uwAVS_RESPONSE_2(2,4)<4;4,1> + mov (4) uwDEST_Y(23,1)<4> uwAVS_RESPONSE_2(2,12)<4;4,1> + mov (4) uwDEST_Y(26,1)<4> uwAVS_RESPONSE_2(3,0)<4;4,1> + mov (4) uwDEST_Y(27,1)<4> uwAVS_RESPONSE_2(3,8)<4;4,1> + mov (4) uwDEST_Y(30,1)<4> uwAVS_RESPONSE_2(3,4)<4;4,1> + mov (4) uwDEST_Y(31,1)<4> uwAVS_RESPONSE_2(3,12)<4;4,1> + +// Move second 8x8 words of U to dest GRF + mov (4) uwDEST_Y(2,0)<4> uwAVS_RESPONSE_2(4,0)<4;4,1> + mov (4) uwDEST_Y(3,0)<4> uwAVS_RESPONSE_2(4,8)<4;4,1> + mov (4) uwDEST_Y(6,0)<4> uwAVS_RESPONSE_2(4,4)<4;4,1> + mov (4) uwDEST_Y(7,0)<4> uwAVS_RESPONSE_2(4,12)<4;4,1> + mov (4) uwDEST_Y(10,0)<4> uwAVS_RESPONSE_2(5,0)<4;4,1> + mov (4) uwDEST_Y(11,0)<4> uwAVS_RESPONSE_2(5,8)<4;4,1> + mov (4) uwDEST_Y(14,0)<4> uwAVS_RESPONSE_2(5,4)<4;4,1> + mov (4) uwDEST_Y(15,0)<4> uwAVS_RESPONSE_2(5,12)<4;4,1> + mov (4) uwDEST_Y(18,0)<4> uwAVS_RESPONSE_2(8,0)<4;4,1> + mov (4) uwDEST_Y(19,0)<4> uwAVS_RESPONSE_2(8,8)<4;4,1> + mov (4) uwDEST_Y(22,0)<4> uwAVS_RESPONSE_2(8,4)<4;4,1> + mov (4) uwDEST_Y(23,0)<4> uwAVS_RESPONSE_2(8,12)<4;4,1> + mov (4) uwDEST_Y(26,0)<4> uwAVS_RESPONSE_2(9,0)<4;4,1> + mov (4) uwDEST_Y(27,0)<4> uwAVS_RESPONSE_2(9,8)<4;4,1> + mov (4) uwDEST_Y(30,0)<4> uwAVS_RESPONSE_2(9,4)<4;4,1> + mov (4) uwDEST_Y(31,0)<4> uwAVS_RESPONSE_2(9,12)<4;4,1> + +// Move second 8x8 words of V to dest GRF + mov (4) uwDEST_Y(2,2)<4> uwAVS_RESPONSE_2(6,0)<4;4,1> + mov (4) uwDEST_Y(3,2)<4> uwAVS_RESPONSE_2(6,8)<4;4,1> + mov (4) uwDEST_Y(6,2)<4> uwAVS_RESPONSE_2(6,4)<4;4,1> + mov (4) uwDEST_Y(7,2)<4> uwAVS_RESPONSE_2(6,12)<4;4,1> + mov (4) uwDEST_Y(10,2)<4> uwAVS_RESPONSE_2(7,0)<4;4,1> + mov (4) uwDEST_Y(11,2)<4> uwAVS_RESPONSE_2(7,8)<4;4,1> + mov (4) uwDEST_Y(14,2)<4> uwAVS_RESPONSE_2(7,4)<4;4,1> + mov (4) uwDEST_Y(15,2)<4> uwAVS_RESPONSE_2(7,12)<4;4,1> + mov (4) uwDEST_Y(18,2)<4> 
uwAVS_RESPONSE_2(10,0)<4;4,1> + mov (4) uwDEST_Y(19,2)<4> uwAVS_RESPONSE_2(10,8)<4;4,1> + mov (4) uwDEST_Y(22,2)<4> uwAVS_RESPONSE_2(10,4)<4;4,1> + mov (4) uwDEST_Y(23,2)<4> uwAVS_RESPONSE_2(10,12)<4;4,1> + mov (4) uwDEST_Y(26,2)<4> uwAVS_RESPONSE_2(11,0)<4;4,1> + mov (4) uwDEST_Y(27,2)<4> uwAVS_RESPONSE_2(11,8)<4;4,1> + mov (4) uwDEST_Y(30,2)<4> uwAVS_RESPONSE_2(11,4)<4;4,1> + mov (4) uwDEST_Y(31,2)<4> uwAVS_RESPONSE_2(11,12)<4;4,1> + +// Move second 8x8 words of A to dest GRF + mov (4) uwDEST_Y(2,3)<4> 0:uw + mov (4) uwDEST_Y(3,3)<4> 0:uw + mov (4) uwDEST_Y(6,3)<4> 0:uw + mov (4) uwDEST_Y(7,3)<4> 0:uw + mov (4) uwDEST_Y(10,3)<4> 0:uw + mov (4) uwDEST_Y(11,3)<4> 0:uw + mov (4) uwDEST_Y(14,3)<4> 0:uw + mov (4) uwDEST_Y(15,3)<4> 0:uw + mov (4) uwDEST_Y(18,3)<4> 0:uw + mov (4) uwDEST_Y(19,3)<4> 0:uw + mov (4) uwDEST_Y(22,3)<4> 0:uw + mov (4) uwDEST_Y(23,3)<4> 0:uw + mov (4) uwDEST_Y(26,3)<4> 0:uw + mov (4) uwDEST_Y(27,3)<4> 0:uw + mov (4) uwDEST_Y(30,3)<4> 0:uw + mov (4) uwDEST_Y(31,3)<4> 0:uw + +/* This section will be used if 16-bit output is needed in planar format -vK + // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each RGF. 
+ $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2)<1> uwAVS_RESPONSE(%1,0)<8;4,1> + mov (8) uwDEST_Y(%1*2+1)<1> uwAVS_RESPONSE(%1,8)<8;4,1> + } + + // Move 1st 8x8 words of U to dest GRF (Copy high byte in a word) + mov (8) uwDEST_U(0)<1> uwAVS_RESPONSE(4,0)<8;4,1> + mov (8) uwDEST_U(1)<1> uwAVS_RESPONSE(4,8)<8;4,1> + mov (8) uwDEST_U(2)<1> uwAVS_RESPONSE(5,0)<8;4,1> + mov (8) uwDEST_U(3)<1> uwAVS_RESPONSE(5,8)<8;4,1> + mov (8) uwDEST_U(4)<1> uwAVS_RESPONSE(8,0)<8;4,1> + mov (8) uwDEST_U(5)<1> uwAVS_RESPONSE(8,8)<8;4,1> + mov (8) uwDEST_U(6)<1> uwAVS_RESPONSE(9,0)<8;4,1> + mov (8) uwDEST_U(7)<1> uwAVS_RESPONSE(9,8)<8;4,1> + + // Move 1st 8x8 words of V to dest GRF + mov (8) uwDEST_V(0)<1> uwAVS_RESPONSE(6,0)<8;4,1> + mov (8) uwDEST_V(1)<1> uwAVS_RESPONSE(6,8)<8;4,1> + mov (8) uwDEST_V(2)<1> uwAVS_RESPONSE(7,0)<8;4,1> + mov (8) uwDEST_V(3)<1> uwAVS_RESPONSE(7,8)<8;4,1> + mov (8) uwDEST_V(4)<1> uwAVS_RESPONSE(10,0)<8;4,1> + mov (8) uwDEST_V(5)<1> uwAVS_RESPONSE(10,8)<8;4,1> + mov (8) uwDEST_V(6)<1> uwAVS_RESPONSE(11,0)<8;4,1> + mov (8) uwDEST_V(7)<1> uwAVS_RESPONSE(11,8)<8;4,1> + + // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF. 
+ $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2,8)<1> uwAVS_RESPONSE_2(%1,0)<8;4,1> + mov (8) uwDEST_Y(%1*2+1,8)<1> uwAVS_RESPONSE_2(%1,8)<8;4,1> + } + + // Move 2st 8x8 words of U to dest GRF (Copy high byte in a word) + mov (8) uwDEST_U(0,8)<1> uwAVS_RESPONSE_2(4,0)<8;4,1> + mov (8) uwDEST_U(1,8)<1> uwAVS_RESPONSE_2(4,8)<8;4,1> + mov (8) uwDEST_U(2,8)<1> uwAVS_RESPONSE_2(5,0)<8;4,1> + mov (8) uwDEST_U(3,8)<1> uwAVS_RESPONSE_2(5,8)<8;4,1> + mov (8) uwDEST_U(4,8)<1> uwAVS_RESPONSE_2(8,0)<8;4,1> + mov (8) uwDEST_U(5,8)<1> uwAVS_RESPONSE_2(8,8)<8;4,1> + mov (8) uwDEST_U(6,8)<1> uwAVS_RESPONSE_2(9,0)<8;4,1> + mov (8) uwDEST_U(7,8)<1> uwAVS_RESPONSE_2(9,8)<8;4,1> + + // Move 2st 8x8 words of V to dest GRF + mov (8) uwDEST_V(0,8)<1> uwAVS_RESPONSE_2(6,0)<8;4,1> + mov (8) uwDEST_V(1,8)<1> uwAVS_RESPONSE_2(6,8)<8;4,1> + mov (8) uwDEST_V(2,8)<1> uwAVS_RESPONSE_2(7,0)<8;4,1> + mov (8) uwDEST_V(3,8)<1> uwAVS_RESPONSE_2(7,8)<8;4,1> + mov (8) uwDEST_V(4,8)<1> uwAVS_RESPONSE_2(10,0)<8;4,1> + mov (8) uwDEST_V(5,8)<1> uwAVS_RESPONSE_2(10,8)<8;4,1> + mov (8) uwDEST_V(6,8)<1> uwAVS_RESPONSE_2(11,0)<8;4,1> + mov (8) uwDEST_V(7,8)<1> uwAVS_RESPONSE_2(11,8)<8;4,1> +*/ +#else + // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each RGF. 
+ $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2)<1> ubAVS_RESPONSE(%1,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_Y(%1*2+1)<1> ubAVS_RESPONSE(%1,8+1)<16;4,2> // Copy high byte in a word + } + + // Move 1st 8x8 words of U to dest GRF (Copy high byte in a word) + mov (8) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;4,2> + mov (8) uwDEST_U(1)<1> ubAVS_RESPONSE(4,8+1)<16;4,2> + mov (8) uwDEST_U(2)<1> ubAVS_RESPONSE(5,1)<16;4,2> + mov (8) uwDEST_U(3)<1> ubAVS_RESPONSE(5,8+1)<16;4,2> + mov (8) uwDEST_U(4)<1> ubAVS_RESPONSE(8,1)<16;4,2> + mov (8) uwDEST_U(5)<1> ubAVS_RESPONSE(8,8+1)<16;4,2> + mov (8) uwDEST_U(6)<1> ubAVS_RESPONSE(9,1)<16;4,2> + mov (8) uwDEST_U(7)<1> ubAVS_RESPONSE(9,8+1)<16;4,2> + + // Move 1st 8x8 words of V to dest GRF + mov (8) uwDEST_V(0)<1> ubAVS_RESPONSE(6,1)<16;4,2> + mov (8) uwDEST_V(1)<1> ubAVS_RESPONSE(6,8+1)<16;4,2> + mov (8) uwDEST_V(2)<1> ubAVS_RESPONSE(7,1)<16;4,2> + mov (8) uwDEST_V(3)<1> ubAVS_RESPONSE(7,8+1)<16;4,2> + mov (8) uwDEST_V(4)<1> ubAVS_RESPONSE(10,1)<16;4,2> + mov (8) uwDEST_V(5)<1> ubAVS_RESPONSE(10,8+1)<16;4,2> + mov (8) uwDEST_V(6)<1> ubAVS_RESPONSE(11,1)<16;4,2> + mov (8) uwDEST_V(7)<1> ubAVS_RESPONSE(11,8+1)<16;4,2> + + // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF. 
+ $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2,8)<1> ubAVS_RESPONSE_2(%1,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_Y(%1*2+1,8)<1> ubAVS_RESPONSE_2(%1,8+1)<16;4,2> // Copy high byte in a word + } + + // Move 2st 8x8 words of U to dest GRF (Copy high byte in a word) + mov (8) uwDEST_U(0,8)<1> ubAVS_RESPONSE_2(4,1)<16;4,2> + mov (8) uwDEST_U(1,8)<1> ubAVS_RESPONSE_2(4,8+1)<16;4,2> + mov (8) uwDEST_U(2,8)<1> ubAVS_RESPONSE_2(5,1)<16;4,2> + mov (8) uwDEST_U(3,8)<1> ubAVS_RESPONSE_2(5,8+1)<16;4,2> + mov (8) uwDEST_U(4,8)<1> ubAVS_RESPONSE_2(8,1)<16;4,2> + mov (8) uwDEST_U(5,8)<1> ubAVS_RESPONSE_2(8,8+1)<16;4,2> + mov (8) uwDEST_U(6,8)<1> ubAVS_RESPONSE_2(9,1)<16;4,2> + mov (8) uwDEST_U(7,8)<1> ubAVS_RESPONSE_2(9,8+1)<16;4,2> + + // Move 2st 8x8 words of V to dest GRF + mov (8) uwDEST_V(0,8)<1> ubAVS_RESPONSE_2(6,1)<16;4,2> + mov (8) uwDEST_V(1,8)<1> ubAVS_RESPONSE_2(6,8+1)<16;4,2> + mov (8) uwDEST_V(2,8)<1> ubAVS_RESPONSE_2(7,1)<16;4,2> + mov (8) uwDEST_V(3,8)<1> ubAVS_RESPONSE_2(7,8+1)<16;4,2> + mov (8) uwDEST_V(4,8)<1> ubAVS_RESPONSE_2(10,1)<16;4,2> + mov (8) uwDEST_V(5,8)<1> ubAVS_RESPONSE_2(10,8+1)<16;4,2> + mov (8) uwDEST_V(6,8)<1> ubAVS_RESPONSE_2(11,1)<16;4,2> + mov (8) uwDEST_V(7,8)<1> ubAVS_RESPONSE_2(11,8+1)<16;4,2> +#endif + + // Re-define new # of lines + #undef nUV_NUM_OF_ROWS + #undef nY_NUM_OF_ROWS + + #define nY_NUM_OF_ROWS 8 + #define nUV_NUM_OF_ROWS 8 + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x4.asm new file mode 100644 index 0000000..37202f4 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x4.asm @@ -0,0 +1,45 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. 
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- PL2_AVS_IEF_Unpack_8x4.asm ---------- + + // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF. + $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2)<1> ubAVS_RESPONSE(%1,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_Y(%1*2+1)<1> ubAVS_RESPONSE(%1,8+1)<16;4,2> // Copy high byte in a word + } + + // Move 8x4 words of U to dest GRF (Copy high byte in a word) + mov (8) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;4,2> + mov (8) uwDEST_U(0,8)<1> ubAVS_RESPONSE(5,1)<16;4,2> + mov (8) uwDEST_U(1)<1> ubAVS_RESPONSE(8,1)<16;4,2> + mov (8) uwDEST_U(1,8)<1> ubAVS_RESPONSE(9,1)<16;4,2> + + // Move 8x4 words of V to dest GRF (Copy high byte in a word) + mov (8) uwDEST_V(0)<1> ubAVS_RESPONSE(6,1)<16;4,2> + mov (8) uwDEST_V(0,8)<1> ubAVS_RESPONSE(7,1)<16;4,2> + mov (8) uwDEST_V(1)<1> ubAVS_RESPONSE(10,1)<16;4,2> + mov (8) uwDEST_V(1,8)<1> ubAVS_RESPONSE(11,1)<16;4,2> + + // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF. + $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2,8)<1> ubAVS_RESPONSE_2(%1,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_Y(%1*2+1,8)<1> ubAVS_RESPONSE_2(%1,8+1)<16;4,2> // Copy high byte in a word + } + +//------------------------------------------------------------------------------ + + // Re-define the Y / UV row counts after this unpack (Y = 8, UV = 4) + #undef nUV_NUM_OF_ROWS + #undef nY_NUM_OF_ROWS + + #define nY_NUM_OF_ROWS 8 + #define nUV_NUM_OF_ROWS 4 + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x8.asm new file mode 100644 index 0000000..ec9f754 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_AVS_IEF_Unpack_8x8.asm @@ -0,0 +1,53 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0.
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- PL2_AVS_IEF_Unpack_8x8.asm ---------- + + // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF. + $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2)<1> ubAVS_RESPONSE(%1,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_Y(%1*2+1)<1> ubAVS_RESPONSE(%1,8+1)<16;4,2> // Copy high byte in a word + } + + // Move 8x8 words of U to dest GRF (Copy high byte in a word) + mov (8) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;4,2> + mov (8) uwDEST_U(0,8)<1> ubAVS_RESPONSE(4,8+1)<16;4,2> + mov (8) uwDEST_U(1)<1> ubAVS_RESPONSE(5,1)<16;4,2> + mov (8) uwDEST_U(1,8)<1> ubAVS_RESPONSE(5,8+1)<16;4,2> + mov (8) uwDEST_U(2)<1> ubAVS_RESPONSE(8,1)<16;4,2> + mov (8) uwDEST_U(2,8)<1> ubAVS_RESPONSE(8,8+1)<16;4,2> + mov (8) uwDEST_U(3)<1> ubAVS_RESPONSE(9,1)<16;4,2> + mov (8) uwDEST_U(3,8)<1> ubAVS_RESPONSE(9,8+1)<16;4,2> + + // Move 8x8 words of V to dest GRF (Copy high byte in a word) + mov (8) uwDEST_V(0)<1> ubAVS_RESPONSE(6,1)<16;4,2> + mov (8) uwDEST_V(0,8)<1> ubAVS_RESPONSE(6,8+1)<16;4,2> + mov (8) uwDEST_V(1)<1> ubAVS_RESPONSE(7,1)<16;4,2> + mov (8) uwDEST_V(1,8)<1> ubAVS_RESPONSE(7,8+1)<16;4,2> + mov (8) uwDEST_V(2)<1> ubAVS_RESPONSE(10,1)<16;4,2> + mov (8) uwDEST_V(2,8)<1> ubAVS_RESPONSE(10,8+1)<16;4,2> + mov (8) uwDEST_V(3)<1> ubAVS_RESPONSE(11,1)<16;4,2> + mov (8) uwDEST_V(3,8)<1> ubAVS_RESPONSE(11,8+1)<16;4,2> + + // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF.
+ $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2,8)<1> ubAVS_RESPONSE_2(%1,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_Y(%1*2+1,8)<1> ubAVS_RESPONSE_2(%1,8+1)<16;4,2> // Copy high byte in a word + } + +//------------------------------------------------------------------------------ + + // Re-define the Y / UV row counts after this unpack (Y = 8, UV = 8) + #undef nUV_NUM_OF_ROWS + #undef nY_NUM_OF_ROWS + + #define nY_NUM_OF_ROWS 8 + #define nUV_NUM_OF_ROWS 8 + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_Scaling.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_Scaling.asm new file mode 100644 index 0000000..7849afd --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL2_Scaling.asm @@ -0,0 +1,71 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +//---------- PL2_Scaling.asm ---------- +#include "Scaling.inc" + + // Build a 16-element ramp (0.0 .. 15.0) in float32; it is multiplied by the X step in the mac below +// mov (8) SAMPLER_RAMP(0)<1> 0x76543210:v +// add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f +mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf //3, 2, 1, 0 in float vector +mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf //7, 6, 5, 4 in float vector +add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f + +//Module: PrepareScaleCoord.asm + + // Setup for sampler msg hdr + mov (2) rMSGSRC.0<1>:ud 0:ud { NoDDClr } // Unused fields + mov (1) rMSGSRC.2<1>:ud 0:ud { NoDDChk } // Write and offset + + // Initialise all 16 v coordinates to the vertical origin (step Y is added once per row in the loop below) + mov (16) mfMSGPAYLOAD(2)<1> fSRC_VID_V_ORI<0;1,0>:f + mov (16) SCALE_COORD_Y<1>:f fSRC_VID_V_ORI<0;1,0>:f + + // Calculate 16 u: u[i] = horizontal origin + step X * ramp[i] (acc0 is preloaded with the origin, then mac) +// line (16) mfMSGPAYLOAD(0)<1> SCALE_STEP_X<0;1,0>:f SAMPLER_RAMP(0) // Assign to mrf directly + mov (16) acc0:f fSRC_VID_H_ORI<0;1,0>:f { Compr } + mac (16) mfMSGPAYLOAD(0)<1> fVIDEO_STEP_X<0;1,0>:f SAMPLER_RAMP(0) { Compr } + + // Set up the constants used by the line instructions in the loop below + mov (1) SCALE_LINE_P255<1>:f 255.0:f { NoDDClr } //{ NoDDClr, NoDDChk } + mov (1) SCALE_LINE_P0_5<1>:f 0.5:f { NoDDChk } + +//------------------------------------------------------------------------------ + +$for (0; <nY_NUM_OF_ROWS; 1) { + + // Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
+ mov (8) MSGHDR_SCALE.0:ud rMSGSRC.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (16) SCALE_RESPONSE_YW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_Y+nBI_CURRENT_SRC_Y + send (16) SCALE_RESPONSE_UW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_UV+nBI_CURRENT_SRC_UV + + // Calculate 16 v for next line + add (16) mfMSGPAYLOAD(2)<1> SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Assign to mrf directly + add (16) SCALE_COORD_Y<1>:f SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Advance the running v kept in SCALE_COORD_Y by the same step + + // Scale back to [0, 255], convert f to ud + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(0) { Compr } // Y response + mov (16) SCALE_RESPONSE_YD(0)<1> acc0:f { Compr } + + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_UF(0) { Compr } // UV response regs 0.. (stored to DEST_U below) + mov (16) SCALE_RESPONSE_UD(0)<1> acc0:f { Compr } + + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_UF(2) { Compr } // UV response regs 2.. (stored to DEST_V below) + mov (16) SCALE_RESPONSE_UD(2)<1> acc0:f { Compr } + + mov (16) DEST_Y(%1)<1> SCALE_RESPONSE_YB(0) //possible error due to truncation - vK + mov (16) DEST_U(%1)<1> SCALE_RESPONSE_UB(0) //possible error due to truncation - vK + mov (16) DEST_V(%1)<1> SCALE_RESPONSE_UB(2) //possible error due to truncation - vK + +} + + #define nSRC_REGION nREGION_1 + +//------------------------------------------------------------------------------ diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_16x8.asm new file mode 100644 index 0000000..50a050c --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_16x8.asm @@ -0,0 +1,69 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0.
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- PL3_AVS_IEF_16x8.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 Y surface +// 2 sampler reads for 8x8 U surface +// 2 sampler reads for 8x8 V surface +//------------------------------------------------------------------------------ + + // 1st 8x8 setup + #include "AVS_SetupFirstBlock.asm" + + // 1st 8x8 Y sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud // Enable green channel + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + // Return Y in 4 GRFs + + // 1st 8x8 U sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_CHANNEL_ONLY:ud // Enable red channel + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U + // Return U in 4 GRFs + + // 1st 8x8 V sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_CHANNEL_ONLY:ud // Dummy instruction (same value as set for U) to avoid back-to-back send instructions + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(8)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V + // Return V in 4 GRFs + + // 2nd 8x8 setup + #include "AVS_SetupSecondBlock.asm" + + // 2nd 8x8 Y sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud // Enable green channel + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + // Return Y in 4 GRFs + + //
2nd 8x8 U sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_CHANNEL_ONLY:ud // Enable red channel + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE_2(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U + // Return U in 4 GRFs + + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_CHANNEL_ONLY:ud // Dummy instruction (same value as set for U) just to avoid back-to-back send instructions + + // 2nd 8x8 V sampling + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE_2(8)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V + // Return V in 4 GRFs + +//------------------------------------------------------------------------------ +// Unpacking sampler reads to 4:4:4 internal planar +//------------------------------------------------------------------------------ + #include "PL3_AVS_IEF_Unpack_16x8.asm" + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x4.asm new file mode 100644 index 0000000..35a5dd3 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x4.asm @@ -0,0 +1,60 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +//---------- PL3_AVS_IEF_8x4.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 Y surface +// 1 sampler read for 8x8 U surface +// 1 sampler read for 8x8 V surface +//------------------------------------------------------------------------------ + + // 1st 8x8 setup + #include "AVS_SetupFirstBlock.asm" + + // 1st 8x8 Y sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud // Enable green channel + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + // Return Y in 4 GRFs + + // 8x8 U sampling ; Only 8x4 will be used + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_CHANNEL_ONLY:ud // Enable red channel + mul (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f 2.0:f // Step X for chroma = 2 * luma step X + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U + // Return U in 4 GRFs + + // 8x8 V sampling ; Only 8x4 will be used + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_CHANNEL_ONLY:ud // Dummy instruction (same value as set for U) just to avoid back-to-back send instructions
+ mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(8)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V + // Return V in 4 GRFs + + // 2nd 8x8 setup + #include "AVS_SetupSecondBlock.asm" + + // 2nd 8x8 Y sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud // Enable green channel + mov (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f // Restore Step X for luma + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(12)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + // Return Y in 4 GRFs. NOTE(review): written to uwAVS_RESPONSE(12), whereas PL3_AVS_IEF_16x8.asm uses uwAVS_RESPONSE_2(0) for this step - presumably aliases, confirm in AVS_IEF.inc + +//------------------------------------------------------------------------------ +// Unpacking sampler reads to 4:2:0 internal planar +//------------------------------------------------------------------------------ + #include "PL3_AVS_IEF_Unpack_8x4.asm" + + + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x8.asm new file mode 100644 index 0000000..d67ad04 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_8x8.asm @@ -0,0 +1,60 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +//---------- PL3_AVS_IEF_8x8.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 Y surface +// 1 sampler read for 8x8 U surface +// 1 sampler read for 8x8 V surface +//------------------------------------------------------------------------------ + + // 1st 8x8 setup + #include "AVS_SetupFirstBlock.asm" + + // 1st 8x8 Y sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud // Enable green channel + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + // Return Y in 4 GRFs + + // 8x8 U sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_CHANNEL_ONLY:ud // Enable red channel + mul (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f 2.0:f // Step X for chroma = 2 * luma step X + mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(4)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_U+nBI_CURRENT_SRC_U + // Return U in 4 GRFs + + // 8x8 V sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_RED_CHANNEL_ONLY:ud // Dummy instruction (same value as set for U) just to avoid back-to-back send instructions
+ mov (16) mAVS_8x8_HDR_UV.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(8)<1> mAVS_8x8_HDR_UV udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_V+nBI_CURRENT_SRC_V + // Return V in 4 GRFs + + // 2nd 8x8 setup + #include "AVS_SetupSecondBlock.asm" + + // 2nd 8x8 Y sampling + mov (1) rAVS_8x8_HDR.2:ud nAVS_GREEN_CHANNEL_ONLY:ud // Enable green channel + mov (1) rAVS_PAYLOAD.1:f fVIDEO_STEP_X:f // Restore Step X for luma + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(12)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_1CH+nSI_SRC_Y+nBI_CURRENT_SRC_Y + // Return Y in 4 GRFs. NOTE(review): written to uwAVS_RESPONSE(12), whereas PL3_AVS_IEF_16x8.asm uses uwAVS_RESPONSE_2(0) for this step - presumably aliases, confirm in AVS_IEF.inc + +//------------------------------------------------------------------------------ +// Unpacking sampler reads to 4:2:2 internal planar +//------------------------------------------------------------------------------ + #include "PL3_AVS_IEF_Unpack_8x8.asm" + + + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_16x8.asm new file mode 100644 index 0000000..f88ab89 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_16x8.asm @@ -0,0 +1,240 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php.
+ * + */ + +//---------- PL3_AVS_IEF_Unpack_16x8.asm ---------- + +#ifdef AVS_OUTPUT_16_BIT //Output is packed in AVYU format +// Move first 8x8 words of Y to dest GRF (as packed) + mov (4) uwDEST_Y(0,1)<4> uwAVS_RESPONSE(0,0)<4;4,1> + mov (4) uwDEST_Y(1,1)<4> uwAVS_RESPONSE(0,8)<4;4,1> + mov (4) uwDEST_Y(4,1)<4> uwAVS_RESPONSE(0,4)<4;4,1> + mov (4) uwDEST_Y(5,1)<4> uwAVS_RESPONSE(0,12)<4;4,1> + mov (4) uwDEST_Y(8,1)<4> uwAVS_RESPONSE(1,0)<4;4,1> + mov (4) uwDEST_Y(9,1)<4> uwAVS_RESPONSE(1,8)<4;4,1> + mov (4) uwDEST_Y(12,1)<4> uwAVS_RESPONSE(1,4)<4;4,1> + mov (4) uwDEST_Y(13,1)<4> uwAVS_RESPONSE(1,12)<4;4,1> + mov (4) uwDEST_Y(16,1)<4> uwAVS_RESPONSE(2,0)<4;4,1> + mov (4) uwDEST_Y(17,1)<4> uwAVS_RESPONSE(2,8)<4;4,1> + mov (4) uwDEST_Y(20,1)<4> uwAVS_RESPONSE(2,4)<4;4,1> + mov (4) uwDEST_Y(21,1)<4> uwAVS_RESPONSE(2,12)<4;4,1> + mov (4) uwDEST_Y(24,1)<4> uwAVS_RESPONSE(3,0)<4;4,1> + mov (4) uwDEST_Y(25,1)<4> uwAVS_RESPONSE(3,8)<4;4,1> + mov (4) uwDEST_Y(28,1)<4> uwAVS_RESPONSE(3,4)<4;4,1> + mov (4) uwDEST_Y(29,1)<4> uwAVS_RESPONSE(3,12)<4;4,1> + +// Move first 8x8 words of U to dest GRF (as packed) + mov (4) uwDEST_Y(0,0)<4> uwAVS_RESPONSE(4,0)<4;4,1> + mov (4) uwDEST_Y(1,0)<4> uwAVS_RESPONSE(4,8)<4;4,1> + mov (4) uwDEST_Y(4,0)<4> uwAVS_RESPONSE(4,4)<4;4,1> + mov (4) uwDEST_Y(5,0)<4> uwAVS_RESPONSE(4,12)<4;4,1> + mov (4) uwDEST_Y(8,0)<4> uwAVS_RESPONSE(5,0)<4;4,1> + mov (4) uwDEST_Y(9,0)<4> uwAVS_RESPONSE(5,8)<4;4,1> + mov (4) uwDEST_Y(12,0)<4> uwAVS_RESPONSE(5,4)<4;4,1> + mov (4) uwDEST_Y(13,0)<4> uwAVS_RESPONSE(5,12)<4;4,1> + mov (4) uwDEST_Y(16,0)<4> uwAVS_RESPONSE(6,0)<4;4,1> + mov (4) uwDEST_Y(17,0)<4> uwAVS_RESPONSE(6,8)<4;4,1> + mov (4) uwDEST_Y(20,0)<4> uwAVS_RESPONSE(6,4)<4;4,1> + mov (4) uwDEST_Y(21,0)<4> uwAVS_RESPONSE(6,12)<4;4,1> + mov (4) uwDEST_Y(24,0)<4> uwAVS_RESPONSE(7,0)<4;4,1> + mov (4) uwDEST_Y(25,0)<4> uwAVS_RESPONSE(7,8)<4;4,1> + mov (4) uwDEST_Y(28,0)<4> uwAVS_RESPONSE(7,4)<4;4,1> + mov (4) uwDEST_Y(29,0)<4> uwAVS_RESPONSE(7,12)<4;4,1> + +// 
Move first 8x8 words of V to dest GRF (as packed) + mov (4) uwDEST_Y(0,2)<4> uwAVS_RESPONSE(8,0)<4;4,1> + mov (4) uwDEST_Y(1,2)<4> uwAVS_RESPONSE(8,8)<4;4,1> + mov (4) uwDEST_Y(4,2)<4> uwAVS_RESPONSE(8,4)<4;4,1> + mov (4) uwDEST_Y(5,2)<4> uwAVS_RESPONSE(8,12)<4;4,1> + mov (4) uwDEST_Y(8,2)<4> uwAVS_RESPONSE(9,0)<4;4,1> + mov (4) uwDEST_Y(9,2)<4> uwAVS_RESPONSE(9,8)<4;4,1> + mov (4) uwDEST_Y(12,2)<4> uwAVS_RESPONSE(9,4)<4;4,1> + mov (4) uwDEST_Y(13,2)<4> uwAVS_RESPONSE(9,12)<4;4,1> + mov (4) uwDEST_Y(16,2)<4> uwAVS_RESPONSE(10,0)<4;4,1> + mov (4) uwDEST_Y(17,2)<4> uwAVS_RESPONSE(10,8)<4;4,1> + mov (4) uwDEST_Y(20,2)<4> uwAVS_RESPONSE(10,4)<4;4,1> + mov (4) uwDEST_Y(21,2)<4> uwAVS_RESPONSE(10,12)<4;4,1> + mov (4) uwDEST_Y(24,2)<4> uwAVS_RESPONSE(11,0)<4;4,1> + mov (4) uwDEST_Y(25,2)<4> uwAVS_RESPONSE(11,8)<4;4,1> + mov (4) uwDEST_Y(28,2)<4> uwAVS_RESPONSE(11,4)<4;4,1> + mov (4) uwDEST_Y(29,2)<4> uwAVS_RESPONSE(11,12)<4;4,1> + +// Move first 8x8 words of A to dest GRF (as packed) + mov (4) uwDEST_Y(0,3)<4> 0:uw + mov (4) uwDEST_Y(1,3)<4> 0:uw + mov (4) uwDEST_Y(4,3)<4> 0:uw + mov (4) uwDEST_Y(5,3)<4> 0:uw + mov (4) uwDEST_Y(8,3)<4> 0:uw + mov (4) uwDEST_Y(9,3)<4> 0:uw + mov (4) uwDEST_Y(12,3)<4> 0:uw + mov (4) uwDEST_Y(13,3)<4> 0:uw + mov (4) uwDEST_Y(16,3)<4> 0:uw + mov (4) uwDEST_Y(17,3)<4> 0:uw + mov (4) uwDEST_Y(20,3)<4> 0:uw + mov (4) uwDEST_Y(21,3)<4> 0:uw + mov (4) uwDEST_Y(24,3)<4> 0:uw + mov (4) uwDEST_Y(25,3)<4> 0:uw + mov (4) uwDEST_Y(28,3)<4> 0:uw + mov (4) uwDEST_Y(29,3)<4> 0:uw + +// Move second 8x8 words of Y to dest GRF + mov (4) uwDEST_Y(2,1)<4> uwAVS_RESPONSE_2(0,0)<4;4,1> + mov (4) uwDEST_Y(3,1)<4> uwAVS_RESPONSE_2(0,8)<4;4,1> + mov (4) uwDEST_Y(6,1)<4> uwAVS_RESPONSE_2(0,4)<4;4,1> + mov (4) uwDEST_Y(7,1)<4> uwAVS_RESPONSE_2(0,12)<4;4,1> + mov (4) uwDEST_Y(10,1)<4> uwAVS_RESPONSE_2(1,0)<4;4,1> + mov (4) uwDEST_Y(11,1)<4> uwAVS_RESPONSE_2(1,8)<4;4,1> + mov (4) uwDEST_Y(14,1)<4> uwAVS_RESPONSE_2(1,4)<4;4,1> + mov (4) uwDEST_Y(15,1)<4> 
uwAVS_RESPONSE_2(1,12)<4;4,1> + mov (4) uwDEST_Y(18,1)<4> uwAVS_RESPONSE_2(2,0)<4;4,1> + mov (4) uwDEST_Y(19,1)<4> uwAVS_RESPONSE_2(2,8)<4;4,1> + mov (4) uwDEST_Y(22,1)<4> uwAVS_RESPONSE_2(2,4)<4;4,1> + mov (4) uwDEST_Y(23,1)<4> uwAVS_RESPONSE_2(2,12)<4;4,1> + mov (4) uwDEST_Y(26,1)<4> uwAVS_RESPONSE_2(3,0)<4;4,1> + mov (4) uwDEST_Y(27,1)<4> uwAVS_RESPONSE_2(3,8)<4;4,1> + mov (4) uwDEST_Y(30,1)<4> uwAVS_RESPONSE_2(3,4)<4;4,1> + mov (4) uwDEST_Y(31,1)<4> uwAVS_RESPONSE_2(3,12)<4;4,1> + +// Move second 8x8 words of U to dest GRF + mov (4) uwDEST_Y(2,0)<4> uwAVS_RESPONSE_2(4,0)<4;4,1> + mov (4) uwDEST_Y(3,0)<4> uwAVS_RESPONSE_2(4,8)<4;4,1> + mov (4) uwDEST_Y(6,0)<4> uwAVS_RESPONSE_2(4,4)<4;4,1> + mov (4) uwDEST_Y(7,0)<4> uwAVS_RESPONSE_2(4,12)<4;4,1> + mov (4) uwDEST_Y(10,0)<4> uwAVS_RESPONSE_2(5,0)<4;4,1> + mov (4) uwDEST_Y(11,0)<4> uwAVS_RESPONSE_2(5,8)<4;4,1> + mov (4) uwDEST_Y(14,0)<4> uwAVS_RESPONSE_2(5,4)<4;4,1> + mov (4) uwDEST_Y(15,0)<4> uwAVS_RESPONSE_2(5,12)<4;4,1> + mov (4) uwDEST_Y(18,0)<4> uwAVS_RESPONSE_2(6,0)<4;4,1> + mov (4) uwDEST_Y(19,0)<4> uwAVS_RESPONSE_2(6,8)<4;4,1> + mov (4) uwDEST_Y(22,0)<4> uwAVS_RESPONSE_2(6,4)<4;4,1> + mov (4) uwDEST_Y(23,0)<4> uwAVS_RESPONSE_2(6,12)<4;4,1> + mov (4) uwDEST_Y(26,0)<4> uwAVS_RESPONSE_2(7,0)<4;4,1> + mov (4) uwDEST_Y(27,0)<4> uwAVS_RESPONSE_2(7,8)<4;4,1> + mov (4) uwDEST_Y(30,0)<4> uwAVS_RESPONSE_2(7,4)<4;4,1> + mov (4) uwDEST_Y(31,0)<4> uwAVS_RESPONSE_2(7,12)<4;4,1> + +// Move second 8x8 words of V to dest GRF + mov (4) uwDEST_Y(2,2)<4> uwAVS_RESPONSE_2(8,0)<4;4,1> + mov (4) uwDEST_Y(3,2)<4> uwAVS_RESPONSE_2(8,8)<4;4,1> + mov (4) uwDEST_Y(6,2)<4> uwAVS_RESPONSE_2(8,4)<4;4,1> + mov (4) uwDEST_Y(7,2)<4> uwAVS_RESPONSE_2(8,12)<4;4,1> + mov (4) uwDEST_Y(10,2)<4> uwAVS_RESPONSE_2(9,0)<4;4,1> + mov (4) uwDEST_Y(11,2)<4> uwAVS_RESPONSE_2(9,8)<4;4,1> + mov (4) uwDEST_Y(14,2)<4> uwAVS_RESPONSE_2(9,4)<4;4,1> + mov (4) uwDEST_Y(15,2)<4> uwAVS_RESPONSE_2(9,12)<4;4,1> + mov (4) uwDEST_Y(18,2)<4> 
uwAVS_RESPONSE_2(10,0)<4;4,1> + mov (4) uwDEST_Y(19,2)<4> uwAVS_RESPONSE_2(10,8)<4;4,1> + mov (4) uwDEST_Y(22,2)<4> uwAVS_RESPONSE_2(10,4)<4;4,1> + mov (4) uwDEST_Y(23,2)<4> uwAVS_RESPONSE_2(10,12)<4;4,1> + mov (4) uwDEST_Y(26,2)<4> uwAVS_RESPONSE_2(11,0)<4;4,1> + mov (4) uwDEST_Y(27,2)<4> uwAVS_RESPONSE_2(11,8)<4;4,1> + mov (4) uwDEST_Y(30,2)<4> uwAVS_RESPONSE_2(11,4)<4;4,1> + mov (4) uwDEST_Y(31,2)<4> uwAVS_RESPONSE_2(11,12)<4;4,1> + +// Move second 8x8 words of A to dest GRF + mov (4) uwDEST_Y(2,3)<4> 0:uw + mov (4) uwDEST_Y(3,3)<4> 0:uw + mov (4) uwDEST_Y(6,3)<4> 0:uw + mov (4) uwDEST_Y(7,3)<4> 0:uw + mov (4) uwDEST_Y(10,3)<4> 0:uw + mov (4) uwDEST_Y(11,3)<4> 0:uw + mov (4) uwDEST_Y(14,3)<4> 0:uw + mov (4) uwDEST_Y(15,3)<4> 0:uw + mov (4) uwDEST_Y(18,3)<4> 0:uw + mov (4) uwDEST_Y(19,3)<4> 0:uw + mov (4) uwDEST_Y(22,3)<4> 0:uw + mov (4) uwDEST_Y(23,3)<4> 0:uw + mov (4) uwDEST_Y(26,3)<4> 0:uw + mov (4) uwDEST_Y(27,3)<4> 0:uw + mov (4) uwDEST_Y(30,3)<4> 0:uw + mov (4) uwDEST_Y(31,3)<4> 0:uw + +/* This section will be used if 16-bit output is needed in planar format -vK + // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each RGF. 
+ $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2)<1> uwAVS_RESPONSE(%1)<8;4,1> + mov (8) uwDEST_Y(%1*2+1)<1> uwAVS_RESPONSE(%1,8)<8;4,1> + } + + // Move 8x8 words of U to dest GRF + $for(0; <8/2; 1) { + mov (8) uwDEST_U(%1*2)<1> uwAVS_RESPONSE(%1+4)<8;4,1> + mov (8) uwDEST_U(%1*2+1)<1> uwAVS_RESPONSE(%1+4,8)<8;4,1> + } + + // Move 8x8 words of V to dest GRF + $for(0; <8/2; 1) { + mov (8) uwDEST_V(%1*2)<1> uwAVS_RESPONSE(%1+8)<8;4,1> + mov (8) uwDEST_V(%1*2+1)<1> uwAVS_RESPONSE(%1+8,8)<8;4,1> + } + + // Move 2nd 8x8 words of Y to dest GRF + $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2,8)<1> uwAVS_RESPONSE_2(%1)<8;4,1> + mov (8) uwDEST_Y(%1*2+1,8)<1> uwAVS_RESPONSE_2(%1,8)<8;4,1> + } + + // Move 2nd 8x8 words of U to dest GRF + $for(0; <8/2; 1) { + mov (8) uwDEST_U(%1*2,8)<1> uwAVS_RESPONSE_2(%1+4)<8;4,1> + mov (8) uwDEST_U(%1*2+1,8)<1> uwAVS_RESPONSE_2(%1+4,8)<8;4,1> + } + + // Move 2nd 8x8 words of V to dest GRF + $for(0; <8/2; 1) { + mov (8) uwDEST_V(%1*2,8)<1> uwAVS_RESPONSE_2(%1+8)<8;4,1> + mov (8) uwDEST_V(%1*2+1,8)<1> uwAVS_RESPONSE_2(%1+8,8)<8;4,1> + } +*/ +#else /* OUTPUT_8_BIT */ + // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each RGF. + $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2)<1> ubAVS_RESPONSE(%1,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_Y(%1*2+1)<1> ubAVS_RESPONSE(%1,8+1)<16;4,2> // Copy high byte in a word + } + + // Move 8x8 words of U to dest GRF + $for(0; <8/2; 1) { + mov (8) uwDEST_U(%1*2)<1> ubAVS_RESPONSE(%1+4,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_U(%1*2+1)<1> ubAVS_RESPONSE(%1+4,8+1)<16;4,2> // Copy high byte in a word + } + + // Move 8x8 words of V to dest GRF + $for(0; <8/2; 1) { + mov (8) uwDEST_V(%1*2)<1> ubAVS_RESPONSE(%1+8,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_V(%1*2+1)<1> ubAVS_RESPONSE(%1+8,8+1)<16;4,2> // Copy high byte in a word + } + + // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each RGF. 
+ $for(0; <8/2; 1) { + mov (8) uwDEST_Y(%1*2,8)<1> ubAVS_RESPONSE_2(%1,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_Y(%1*2+1,8)<1> ubAVS_RESPONSE_2(%1,8+1)<16;4,2> // Copy high byte in a word + } + + // Move 2nd 8x8 words of U to dest GRF + $for(0; <8/2; 1) { + mov (8) uwDEST_U(%1*2,8)<1> ubAVS_RESPONSE_2(%1+4,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_U(%1*2+1,8)<1> ubAVS_RESPONSE_2(%1+4,8+1)<16;4,2> // Copy high byte in a word + } + + // Move 2nd 8x8 words of V to dest GRF + $for(0; <8/2; 1) { + mov (8) uwDEST_V(%1*2,8)<1> ubAVS_RESPONSE_2(%1+8,1)<16;4,2> // Copy high byte in a word + mov (8) uwDEST_V(%1*2+1,8)<1> ubAVS_RESPONSE_2(%1+8,8+1)<16;4,2> // Copy high byte in a word + } +#endif +//------------------------------------------------------------------------------ + // Re-define new # of lines + #undef nUV_NUM_OF_ROWS + #undef nY_NUM_OF_ROWS + + #define nY_NUM_OF_ROWS 8 + #define nUV_NUM_OF_ROWS 8 + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x4.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x4.asm new file mode 100644 index 0000000..53586e6 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x4.asm @@ -0,0 +1,45 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- PL3_AVS_IEF_Unpack_8x4.asm ---------- + + // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each RGF. 
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1> ubAVS_RESPONSE(%1,1)<16;4,2> // Copy high byte in a word (response reg %1, first half)
+        mov (8) uwDEST_Y(%1*2+1)<1> ubAVS_RESPONSE(%1,8+1)<16;4,2> // Copy high byte in a word (response reg %1, second half)
+    }
+
+    // Move 8x4 words of U to dest GRF (Copy high byte in a word): response regs 4-5 -> DEST_U(0..1)
+    mov (8) uwDEST_U(0)<1> ubAVS_RESPONSE(4,1)<16;4,2>
+    mov (8) uwDEST_U(0,8)<1> ubAVS_RESPONSE(4,9)<16;4,2>
+    mov (8) uwDEST_U(1)<1> ubAVS_RESPONSE(5,1)<16;4,2>
+    mov (8) uwDEST_U(1,8)<1> ubAVS_RESPONSE(5,9)<16;4,2>
+
+    // Move 8x4 words of V to dest GRF (Copy high byte in a word): response regs 8-9 -> DEST_V(0..1)
+    mov (8) uwDEST_V(0)<1> ubAVS_RESPONSE(8,1)<16;4,2>
+    mov (8) uwDEST_V(0,8)<1> ubAVS_RESPONSE(8,9)<16;4,2>
+    mov (8) uwDEST_V(1)<1> ubAVS_RESPONSE(9,1)<16;4,2>
+    mov (8) uwDEST_V(1,8)<1> ubAVS_RESPONSE(9,9)<16;4,2>
+
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF (response regs 12-15).
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1> ubAVS_RESPONSE(%1+12,1)<16;4,2> // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1> ubAVS_RESPONSE(%1+12,8+1)<16;4,2> // Copy high byte in a word
+    }
+
+//------------------------------------------------------------------------------
+    // Re-define the number of rows now held in the DEST_Y / DEST_U / DEST_V registers
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+
+    #define nY_NUM_OF_ROWS 8
+    #define nUV_NUM_OF_ROWS 4
+
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x8.asm
new file mode 100644
index 0000000..f16d04a
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_AVS_IEF_Unpack_8x8.asm
@@ -0,0 +1,44 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+//---------- PL3_AVS_IEF_Unpack_8x8.asm ----------
+
+    // Move 1st 8x8 words of Y to dest GRF at lower 8 words of each GRF.
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2)<1> ubAVS_RESPONSE(%1,1)<16;4,2> // Copy high byte in a word (response reg %1, first half)
+        mov (8) uwDEST_Y(%1*2+1)<1> ubAVS_RESPONSE(%1,8+1)<16;4,2> // Copy high byte in a word (response reg %1, second half)
+    }
+    // Move 8x8 words of U to dest GRF (response regs 4-7 -> DEST_U(0..3))
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_U(%1)<1> ubAVS_RESPONSE(%1+4,1)<16;4,2> // Copy high byte in a word
+        mov (8) uwDEST_U(%1,8)<1> ubAVS_RESPONSE(%1+4,8+1)<16;4,2> // Copy high byte in a word
+    }
+
+    // Move 8x8 words of V to dest GRF (response regs 8-11 -> DEST_V(0..3))
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_V(%1)<1> ubAVS_RESPONSE(%1+8,1)<16;4,2> // Copy high byte in a word
+        mov (8) uwDEST_V(%1,8)<1> ubAVS_RESPONSE(%1+8,8+1)<16;4,2> // Copy high byte in a word
+    }
+
+    // Move 2nd 8x8 words of Y to dest GRF at higher 8 words of each GRF (response regs 12-15).
+    $for(0; <8/2; 1) {
+        mov (8) uwDEST_Y(%1*2,8)<1> ubAVS_RESPONSE(%1+12,1)<16;4,2> // Copy high byte in a word
+        mov (8) uwDEST_Y(%1*2+1,8)<1> ubAVS_RESPONSE(%1+12,8+1)<16;4,2> // Copy high byte in a word
+    }
+
+//------------------------------------------------------------------------------
+    // Re-define the number of rows now held in the DEST_Y / DEST_U / DEST_V registers
+    #undef nUV_NUM_OF_ROWS
+    #undef nY_NUM_OF_ROWS
+
+    #define nY_NUM_OF_ROWS 8
+    #define nUV_NUM_OF_ROWS 8
+
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_Scaling.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_Scaling.asm
new file mode 100644
index 0000000..3d5c689
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL3_Scaling.asm
@@ -0,0 +1,72 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Samples the V, Y and U source surfaces row by row, rescales the float results by 255 and stores bytes in DEST_V/Y/U.
+//---------- PL3_Scaling.asm ----------
+#include "Scaling.inc"
+
+    // Build a 16-element ramp (0..15) in float32
+// mov (8) SAMPLER_RAMP(0)<1> 0x76543210:v
+// add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f
+mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf { NoDDClr }//3, 2, 1, 0 in float vector
+mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf { NoDDChk }//7, 6, 5, 4 in float vector
+add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f // elements 8..15 = elements 0..7 + 8
+
+
+//Module: PrepareScaleCoord.asm
+
+    // Setup for sampler msg hdr
+    mov (2) rMSGSRC.0<1>:ud 0:ud { NoDDClr } // Unused fields
+    mov (1) rMSGSRC.2<1>:ud 0:ud { NoDDChk } // Write and offset
+
+    // Calculate 16 v based on the step Y and vertical origin
+    mov (16) mfMSGPAYLOAD(2)<1> fSRC_VID_V_ORI<0;1,0>:f // same v for all 16 pixels, written to the message payload
+    mov (16) SCALE_COORD_Y<1>:f fSRC_VID_V_ORI<0;1,0>:f // copy of v kept outside the payload for the per-row update
+
+    // Calculate 16 u based on the step X and horizontal origin: u = H_ORI + STEP_X * ramp
+// line (16) mfMSGPAYLOAD(0)<1> SCALE_STEP_X<0;1,0>:f SAMPLER_RAMP(0) // Assign to mrf directly
+    mov (16) acc0:f fSRC_VID_H_ORI<0;1,0>:f { Compr }
+    mac (16) mfMSGPAYLOAD(0)<1> fVIDEO_STEP_X<0;1,0>:f SAMPLER_RAMP(0) { Compr }
+
+    //Setup the constants for line instruction
+    mov (1) SCALE_LINE_P255<1>:f 255.0:f { NoDDClr } //{ NoDDClr, NoDDChk }
+    mov (1) SCALE_LINE_P0_5<1>:f 0.5:f { NoDDChk }
+
+//------------------------------------------------------------------------------
+
+$for (0; <nY_NUM_OF_ROWS; 1) {
+    // Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA).
+    mov (8) MSGHDR_SCALE<1>:ud rMSGSRC<8;8,1>:ud // Copy msg header and payload mirrors to MRFs
+    send (16) SCALE_RESPONSE_VW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_V+nBI_CURRENT_SRC_V
+    send (16) SCALE_RESPONSE_YW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_Y+nBI_CURRENT_SRC_Y
+    send (16) SCALE_RESPONSE_UW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_U+nBI_CURRENT_SRC_U
+
+    // Calculate 16 v for next line
+    add (16) mfMSGPAYLOAD(2)<1> SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Assign to mrf directly
+    add (16) SCALE_COORD_Y<1>:f SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Advance the saved copy of v by the same step
+
+    // Scale back to [0, 255], convert f to ud
+    line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_VF(0) { Compr } // Process B, V
+    mov (16) SCALE_RESPONSE_VD(0)<1> acc0:f { Compr }
+
+    line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(0) { Compr } // Process G, Y
+    mov (16) SCALE_RESPONSE_YD(0)<1> acc0:f { Compr }
+
+    line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_UF(0) { Compr } // Process R, U
+    mov (16) SCALE_RESPONSE_UD(0)<1> acc0:f { Compr }
+
+    mov (16) DEST_V(%1)<1> SCALE_RESPONSE_VB(0) //possible error due to truncation - vK
+    mov (16) DEST_Y(%1)<1> SCALE_RESPONSE_YB(0) //possible error due to truncation - vK
+    mov (16) DEST_U(%1)<1> SCALE_RESPONSE_UB(0) //possible error due to truncation - vK
+
+}
+
+    #define nSRC_REGION nREGION_1
+
+//------------------------------------------------------------------------------
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG.asm
new file mode 100644
index 0000000..e6d8fb2
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG.asm
@@ -0,0 +1,85 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Runs DNDI_Command.asm, writes the STMM block, and (unless DI_ONLY) saves history and the DN current-frame luma.
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+
+    #ifdef DI_ONLY
+        #undef nSMPL_RESP_LEN
+        #define nSMPL_RESP_LEN nSMPL_RESP_LEN_DI // set the number of GRF
+    #else
+        #undef nSMPL_RESP_LEN
+        #define nSMPL_RESP_LEN nSMPL_RESP_LEN_DNDI // set the number of GRF
+    #endif
+
+    #undef nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1 // HIST Block Size for Write; NOTE(review): was documented as 4x2 but the constants read as 4x1 -- confirm
+    #undef nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4 // DN Block Size for Write is 16x4
+
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w NODDCLR_NODDCHK // X origin / 2
+    mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w NODDCLR_NODDCHK // Y origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_STMM:ud NODDCHK // block width and height (8x4)
+    mov (8) mudMSGHDR_STMM(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    mov (8) mudMSGHDR_STMM(1)<1> udRESP(nDI_STMM_OFFSET,0) // Move STMM to MRF
+    send (8) dNULLREG mMSGHDR_STMM udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+
+#ifdef DI_ONLY
+#else
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+
+    //set the save DN parameters
+    mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w NODDCLR // X origin and Y origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud NODDCLR_NODDCHK // block width and height (16x4); NOTE(review): this sequence ends with NoDDClr still set, unlike the STMM sequence above which ends with NODDCHK -- confirm intended
+    mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+
+    // check top/bottom field first
+    cmp.e.f0.0 (1) null<1>:w ubTFLD_FIRST<0;1,0>:ub 1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST // taken when ubTFLD_FIRST == 1
+
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+    }
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+
+    jmpi (1) SAVE_DN_CURR // skip the top-field-first path
+
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+    }
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+
+SAVE_DN_CURR:
+    send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+#endif
+
+// Save Processed frames
+#include "DI_Save_PA.asm"
+
+
+
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV11.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV11.asm
new file mode 100644
index 0000000..96aed78
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV11.asm
@@ -0,0 +1,103 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Runs DNDI_Command.asm, rearranges results to internal planar, saves STMM/history/DN luma, then copies the source UV block to the destination.
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS 8 // Number of Y rows per block (4 rows for each frame)
+    #undef nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
+
+    #undef nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN nSMPL_RESP_LEN_DNDI // set the number of GRF
+    #undef nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1 // HIST Block Size for Write; NOTE(review): was documented as 4x2 but the constants read as 4x1 -- confirm
+    #undef nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4 // DN Block Size for Write is 16x4
+    #undef nDPR_BLOCK_SIZE_UV
+    #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_4 // DN Block Size for UV Write/Read is 8x4
+
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1> ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels
+    }
+    // move the current frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1> ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels
+    }
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w // X origin / 2
+    mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w // Y origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_STMM:ud // block width and height (8x4)
+    mov (8) mudMSGHDR_STMM(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    mov (8) mudMSGHDR_STMM(1)<1> udRESP(nDI_STMM_OFFSET,0) // Move STMM to MRF
+    send (8) dNULLREG mMSGHDR_STMM udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub npDN_YUV:w // pCF_Y_OFFSET[0..3] = ubSRC_CF_OFFSET[0..3] + npDN_YUV
+    // check top/bottom field first
+    cmp.e.f0.0 (1) null<1>:w ubTFLD_FIRST<0;1,0>:ub 1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST // taken when ubTFLD_FIRST == 1
+
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR // skip the top-field-first path
+
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // X origin and Y origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud // block width and height (16x4)
+    mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+
+
+/////////////////////////////NV11 UV Copy/////////////////////////////////////////////////////////
+    //Read UV through DATAPORT
+    add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
+    asr (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 1:w // U/V block X origin is half of Y's (Y origin unchanged)
+    mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (8x4)
+    mov (8) mudMSGHDR_DN<1> rMSGSRC<8;8,1>:ud // message header
+    send (8) udBOT_U_IO(0)<1> mMSGHDR_DN udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_UV:ud
+
+    //Write UV through DATAPORT
+    mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // X origin and Y origin
+    asr (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 1:w // U/V block X origin is half of Y's (Y origin unchanged)
+    mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // block width and height (8x4)
+    mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    mov (8) mudMSGHDR_DN(1)<1> udBOT_U_IO(0)<8;8,1> // payload: the U/V data just read
+    send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_UV:ud
\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV12.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV12.asm
new file mode 100644
index 0000000..69330ba
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_NV12.asm
@@ -0,0 +1,103 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Runs DNDI_Command.asm, rearranges results to internal planar, saves STMM/history/DN luma, then copies the source UV block to the destination.
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS 8 // Number of Y rows per block (4 rows for each frame)
+    #undef nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
+
+    #undef nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN nSMPL_RESP_LEN_DNDI // set the number of GRF
+    #undef nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1 // HIST Block Size for Write; NOTE(review): was documented as 4x2 but the constants read as 4x1 -- confirm
+    #undef nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4 // DN Block Size for Write is 16x4
+    #undef nDPR_BLOCK_SIZE_UV
+    #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_16+nBLOCK_HEIGHT_2 // DN Block Size for UV Write/Read is 16x2
+
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1> ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels
+    }
+    // move the current frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1> ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels
+    }
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w // X origin / 2
+    mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w // Y origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_STMM:ud // block width and height (8x4)
+    mov (8) mudMSGHDR_STMM(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    mov (8) mudMSGHDR_STMM(1)<1> udRESP(nDI_STMM_OFFSET,0) // Move STMM to MRF
+    send (8) dNULLREG mMSGHDR_STMM udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub npDN_YUV:w // pCF_Y_OFFSET[0..3] = ubSRC_CF_OFFSET[0..3] + npDN_YUV
+    // check top/bottom field first
+    cmp.e.f0.0 (1) null<1>:w ubTFLD_FIRST<0;1,0>:ub 1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST // taken when ubTFLD_FIRST == 1
+
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR // skip the top-field-first path
+
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // X origin and Y origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud // block width and height (16x4)
+    mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+
+
+/////////////////////////////NV12 UV Copy 422/////////////////////////////////////////////////////
+    //Read UV through DATAPORT
+    add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
+    asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block Y origin is half of Y's (X origin unchanged)
+    mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (16x2)
+    mov (8) mudMSGHDR_DN<1> rMSGSRC<8;8,1>:ud // message header
+    send (8) udBOT_U_IO(0)<1> mMSGHDR_DN udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_UV:ud
+
+    //Write UV through DATAPORT
+    mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // X origin and Y origin
+    asr (1) rMSGSRC.1<1>:d rMSGSRC.1<0;1,0>:d 1:w // U/V block Y origin is half of Y's (X origin unchanged)
+    mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // block width and height (16x2)
+    mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    mov (8) mudMSGHDR_DN(1)<1> udBOT_U_IO(0)<8;8,1> // payload: the U/V data just read
+    send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_UV:ud
\ No newline at end of file
diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_P208.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_P208.asm
new file mode 100644
index 0000000..7fba14c
--- /dev/null
+++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_P208.asm
@@ -0,0 +1,101 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+// Runs DNDI_Command.asm, rearranges results to internal planar, saves STMM/history/DN luma, then copies the source UV block to the destination.
+#define DI_ENABLE
+
+    #include "DNDI.inc"
+
+    #undef nY_NUM_OF_ROWS
+    #define nY_NUM_OF_ROWS 8 // Number of Y rows per block (4 rows for each frame)
+    #undef nUV_NUM_OF_ROWS
+    #define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block
+
+    #undef nSMPL_RESP_LEN
+    #define nSMPL_RESP_LEN nSMPL_RESP_LEN_DNDI // set the number of GRF
+    #undef nDPW_BLOCK_SIZE_HIST
+    #define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1 // HIST Block Size for Write; NOTE(review): was documented as 4x2 but the constants read as 4x1 -- confirm
+    #undef nDPW_BLOCK_SIZE_DN
+    #define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4 // DN Block Size for Write is 16x4
+
+////////////////////////////////////// Run the DN Algorithm ///////////////////////////////////////
+    #include "DNDI_Command.asm"
+
+////////////////////////////////////// Rearrange for Internal Planar //////////////////////////////
+    // move the previous frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1,0)<1> ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the previous frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
+        mov (8) uwDEST_V(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels
+    }
+    // move the current frame Y component to internal planar format
+    $for (0; <nY_NUM_OF_ROWS/2; 1) {
+        mov (16) uwDEST_Y(%1+4,0)<1> ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16)
+    }
+    // move the current frame U,V components to internal planar format
+    $for (0; <nUV_NUM_OF_ROWS/2; 1) {
+        mov (8) uwDEST_U(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels
+        mov (8) uwDEST_V(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels
+    }
+
+////////////////////////////////////// Save the STMM Data for Next Run /////////////////////////
+    // Write STMM to memory
+    shr (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w // X origin / 2
+    mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w // Y origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_STMM:ud // block width and height (8x4)
+    mov (8) mudMSGHDR_STMM(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    mov (8) mudMSGHDR_STMM(1)<1> udRESP(nDI_STMM_OFFSET,0) // Move STMM to MRF
+    send (8) dNULLREG mMSGHDR_STMM udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud
+
+////////////////////////////////////// Save the History Data for Next Run /////////////////////////
+    #include "DI_Hist_Save.asm"
+
+////////////////////////////////////// Save the DN Curr Frame for Next Run ////////////////////////
+    add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub npDN_YUV:w // pCF_Y_OFFSET[0..3] = ubSRC_CF_OFFSET[0..3] + npDN_YUV
+    // check top/bottom field first
+    cmp.e.f0.0 (1) null<1>:w ubTFLD_FIRST<0;1,0>:ub 1:w
+    (f0.0) jmpi (1) TOP_FIELD_FIRST // taken when ubTFLD_FIRST == 1
+
+BOTTOM_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2)
+        mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3)
+    }
+    jmpi (1) SAVE_DN_CURR // skip the top-field-first path
+
+TOP_FIELD_FIRST:
+    $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) {
+        mov (4) mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2)
+        mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3)
+    }
+SAVE_DN_CURR:
+    mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // X origin and Y origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud // block width and height (16x4)
+    mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud
+
+
+/////////////////////////////P208 UV Copy 422/////////////////////////////////////////////////////
+    //Read UV through DATAPORT
+    add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud // Y Block width and height (16x4) (U/V block size is the same)
+    mov (8) mudMSGHDR_DN<1> rMSGSRC<8;8,1>:ud // message header
+    send (8) udBOT_U_IO(0)<1> mMSGHDR_DN udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nRESLEN_2+nBI_CURRENT_SRC_UV:ud
+
+    //Write UV through DATAPORT
+    mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // X origin and Y origin
+    mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud // block width and height (16x4)
+    mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud // message header
+    mov (8) mudMSGHDR_DN(1)<1> udBOT_U_IO(0)<8;8,1> // payload: first register of the U/V data just read
+    mov (8) mudMSGHDR_DN(2)<1> udBOT_U_IO(1)<8;8,1> // payload: second register of the U/V data just read
+    send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_UV:ud
+
\ No newline at end of file diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_PL3.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_PL3.asm new file mode 100644 index 0000000..f7b891d --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DNDI_ALG_UVCopy_PL3.asm @@ -0,0 +1,106 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +#define DI_ENABLE + + #include "DNDI.inc" + + #undef nY_NUM_OF_ROWS + #define nY_NUM_OF_ROWS 8 // Number of Y rows per block (4 rows for each frame) + #undef nUV_NUM_OF_ROWS + #define nUV_NUM_OF_ROWS 8 // Number of U/V rows per block + + #undef nSMPL_RESP_LEN + #define nSMPL_RESP_LEN nSMPL_RESP_LEN_DNDI // set the number of GRF + #undef nDPW_BLOCK_SIZE_HIST + #define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_1 // HIST Block Size for Write is 4x1 + #undef nDPW_BLOCK_SIZE_DN + #define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_16+nBLOCK_HEIGHT_4 // DN Block Size for Write is 16x4 + #undef nDPR_BLOCK_SIZE_UV + #define nDPR_BLOCK_SIZE_UV nBLOCK_WIDTH_8+nBLOCK_HEIGHT_2 // DN Block Size for UV Write/Read is 8x2 + +////////////////////////////////////// Run the DN Algorithm (NOTE(review): PL_DN_ALG.asm spells this include "DNDI_COMMAND.asm"; confirm the on-disk case for case-sensitive filesystems) /////////////////////////////////////// + #include "DNDI_Command.asm" + +////////////////////////////////////// Rearrange for Internal Planar ////////////////////////////// + // move the previous frame Y component to internal planar format + $for (0; <nY_NUM_OF_ROWS/2; 1) { + mov (16) uwDEST_Y(%1,0)<1> ubRESP(nDI_PREV_FRAME_LUMA_OFFSET,%1*16) + } + // move the previous frame U,V components to internal planar format + $for (0; <nUV_NUM_OF_ROWS/2; 1) { + mov (8) uwDEST_U(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels + mov (8) 
uwDEST_V(0,%1*8)<1> ubRESP(nDI_PREV_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels + } + // move the current frame Y component to internal planar format + $for (0; <nY_NUM_OF_ROWS/2; 1) { + mov (16) uwDEST_Y(%1+4,0)<1> ubRESP(nDI_CURR_FRAME_LUMA_OFFSET,%1*16) + } + // move the current frame U,V components to internal planar format + $for (0; <nUV_NUM_OF_ROWS/2; 1) { + mov (8) uwDEST_U(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16+1)<16;8,2> //U pixels + mov (8) uwDEST_V(2,%1*8)<1> ubRESP(nDI_CURR_FRAME_CHROMA_OFFSET,%1*16)<16;8,2> //V pixels + } + +////////////////////////////////////// Save the STMM Data for Next Run ///////////////////////// + // Write STMM to memory + shr (1) rMSGSRC.0<1>:ud wORIX<0;1,0>:w 1:w // X origin / 2 + mov (1) rMSGSRC.1<1>:ud wORIY<0;1,0>:w // Y origin + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_STMM:ud // block width and height (8x4) + mov (8) mudMSGHDR_STMM(0)<1> rMSGSRC.0<8;8,1>:ud // message header + mov (8) mudMSGHDR_STMM(1)<1> udRESP(nDI_STMM_OFFSET,0) // Move STMM to MRF + send (8) dNULLREG mMSGHDR_STMM udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_STMM+nBI_STMM_HISTORY_OUTPUT:ud + +////////////////////////////////////// Save the History Data for Next Run ///////////////////////// + #include "DI_Hist_Save.asm" + +////////////////////////////////////// Save the DN Curr Frame for Next Run //////////////////////// + add (4) pCF_Y_OFFSET<1>:uw ubSRC_CF_OFFSET<4;4,1>:ub npDN_YUV:w + // check top/bottom field first: ubTFLD_FIRST == 1 jumps to TOP_FIELD_FIRST, otherwise fall through to BOTTOM_FIELD_FIRST + cmp.e.f0.0 (1) null<1>:w ubTFLD_FIRST<0;1,0>:ub 1:w + (f0.0) jmpi (1) TOP_FIELD_FIRST + +BOTTOM_FIELD_FIRST: + $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) { + mov (4) mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 0,2) + mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,4)<4;4,1> // 1st field luma from current frame (line 1,3) + } + jmpi (1) SAVE_DN_CURR + +TOP_FIELD_FIRST: + $for (0,0; <nY_NUM_OF_ROWS/2; 2,1) { + mov (4) 
mudMSGHDR_DN(1,%1*4)<1> udRESP(nDI_CURR_FRAME_LUMA_OFFSET+%2,0)<4;4,1> // 1st field luma from current frame (line 0,2) + mov (4) mudMSGHDR_DN(1,%1*4+4)<1> udRESP(nDI_CURR_2ND_FIELD_LUMA_OFFSET,%2*4)<4;4,1> // 2nd field luma from current frame (line 1,3) + } +SAVE_DN_CURR: + mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // X origin and Y origin + mov (1) rMSGSRC.2<1>:ud nDPW_BLOCK_SIZE_DN:ud // block width and height (16x4) + mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud + send (8) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nDPMW_MSG_LEN_PL_DN_DI+nBI_DESTINATION_Y:ud + + +/////////////////////////////IMC3 UV Copy 422///////////////////////////////////////////////////// + //Read UV through DATAPORT + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Y Block origin + asr (2) rMSGSRC.0<1>:d rMSGSRC.0<2;2,1>:d 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // U/V block width and height (8x2) + mov (8) mudMSGHDR_DN<1> rMSGSRC<8;8,1>:ud + send (4) udBOT_U_IO(0)<1> mMSGHDR_DN udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_U:ud + send (4) udBOT_V_IO(0)<1> mMSGHDR_DN udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nRESLEN_1+nBI_CURRENT_SRC_V:ud + + //Write UV through DATAPORT + mov (2) rMSGSRC.0<1>:ud wORIX<2;2,1>:w // X origin and Y origin (NOTE(review): both elements are overwritten by the following asr, so this mov looks redundant) + asr (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w 1:w // U/V block origin should be half of Y's + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_UV:ud // block width and height (8x2) + mov (8) mudMSGHDR_DN(0)<1> rMSGSRC.0<8;8,1>:ud + mov (4) mudMSGHDR_DN(1)<1> udBOT_U_IO(0)<4;4,1> + send (4) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_U:ud + mov (4) mudMSGHDR_DN(1)<1> udBOT_V_IO(0)<4;4,1> + send (4) dNULLREG mMSGHDR_DN udDUMMY_NULL nDATAPORT_WRITE nDPMW_MSGDSC+nMSGLEN_1+nBI_DESTINATION_V:ud
\ No newline at end of file diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DN_ALG.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DN_ALG.asm new file mode 100644 index 0000000..0b9aa4c --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/PL_DN_ALG.asm @@ -0,0 +1,35 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +#define DI_DISABLE + +#include "DNDI.inc" + +#undef nY_NUM_OF_ROWS +#define nY_NUM_OF_ROWS 8 // Number of Y rows per block + +#undef nSMPL_RESP_LEN +#define nSMPL_RESP_LEN nSMPL_RESP_LEN_DN_PL // Set the Number of GRFs in DNDI response +#undef nDPW_BLOCK_SIZE_DN +#define nDPW_BLOCK_SIZE_DN nBLOCK_WIDTH_16+nBLOCK_HEIGHT_8 // DN Curr Block Size for Write is 16x8 +#undef nDPW_BLOCK_SIZE_HIST +#define nDPW_BLOCK_SIZE_HIST nBLOCK_WIDTH_4+nBLOCK_HEIGHT_2 // HIST Block Size for Write is 4x2 + +////////////////////////////////////// Run the DN Algorithm (NOTE(review): PL_DNDI_ALG_UVCopy_PL3.asm spells this include "DNDI_Command.asm"; confirm the on-disk case for case-sensitive filesystems) /////////////////////////////////////// +#include "DNDI_COMMAND.asm" + +////////////////////////////////////// Rearrange for Internal Planar ////////////////////////////// +$for (0; <nY_NUM_OF_ROWS; 1) { + mov (16) uwDEST_Y(0,%1*16)<1> ubRESP(nNODI_LUMA_OFFSET,%1*16)<16;16,1> // copy line of Y +} + +////////////////////////////////////// Save the History Data for Next Run ///////////////////////// +#include "DNDI_Hist_Save.asm" + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_16x8.asm new file mode 100644 index 0000000..efc7cd6 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_16x8.asm @@ -0,0 +1,33 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. 
+ * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- RGB_AVS_IEF_16x8.asm ---------- + +#include "AVS_IEF.inc" + +//------------------------------------------------------------------------------ +// 2 sampler reads for 8x8 ARGB packed +//------------------------------------------------------------------------------ + + // 1st 8x8 setup + #include "AVS_SetupFirstBlock.asm" + + mov (1) rAVS_8x8_HDR.2:ud nAVS_ALL_CHANNELS:ud // Enable ARGB channels + mov (16) mAVS_8x8_HDR.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE(0)<1> mAVS_8x8_HDR udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_4CH+nSI_SRC_RGB+nBI_CURRENT_SRC_YUV + // Return ARGB in 16 GRFs + + // 2nd 8x8 setup + #include "AVS_SetupSecondBlock.asm" + mov (16) mAVS_8x8_HDR_2.0:ud rAVS_8x8_HDR.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (1) uwAVS_RESPONSE_2(0)<1> mAVS_8x8_HDR_2 udDUMMY_NULL nSMPL_ENGINE nAVS_MSG_DSC_4CH+nSI_SRC_RGB+nBI_CURRENT_SRC_YUV + // Return ARGB in 16 GRFs + + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unpack_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unpack_16x8.asm new file mode 100644 index 0000000..6e2de97 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unpack_16x8.asm @@ -0,0 +1,251 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. 
+ * + */ + +//---------- RGB_AVS_IEF_Unpack_16x8.asm ---------- +#include "AVS_IEF.inc" + +#ifdef AVS_OUTPUT_16_BIT +// Move first 8x8 words of B to dest GRF (as packed) + mov (4) uwDEST_Y(0,2)<4> uwAVS_RESPONSE(4,0)<4;4,1> + mov (4) uwDEST_Y(1,2)<4> uwAVS_RESPONSE(4,8)<4;4,1> + mov (4) uwDEST_Y(4,2)<4> uwAVS_RESPONSE(4,4)<4;4,1> + mov (4) uwDEST_Y(5,2)<4> uwAVS_RESPONSE(4,12)<4;4,1> + mov (4) uwDEST_Y(8,2)<4> uwAVS_RESPONSE(5,0)<4;4,1> + mov (4) uwDEST_Y(9,2)<4> uwAVS_RESPONSE(5,8)<4;4,1> + mov (4) uwDEST_Y(12,2)<4> uwAVS_RESPONSE(5,4)<4;4,1> + mov (4) uwDEST_Y(13,2)<4> uwAVS_RESPONSE(5,12)<4;4,1> + mov (4) uwDEST_Y(16,2)<4> uwAVS_RESPONSE(12,0)<4;4,1> + mov (4) uwDEST_Y(17,2)<4> uwAVS_RESPONSE(12,8)<4;4,1> + mov (4) uwDEST_Y(20,2)<4> uwAVS_RESPONSE(12,4)<4;4,1> + mov (4) uwDEST_Y(21,2)<4> uwAVS_RESPONSE(12,12)<4;4,1> + mov (4) uwDEST_Y(24,2)<4> uwAVS_RESPONSE(13,0)<4;4,1> + mov (4) uwDEST_Y(25,2)<4> uwAVS_RESPONSE(13,8)<4;4,1> + mov (4) uwDEST_Y(28,2)<4> uwAVS_RESPONSE(13,4)<4;4,1> + mov (4) uwDEST_Y(29,2)<4> uwAVS_RESPONSE(13,12)<4;4,1> + +// Move first 8x8 words of G to dest GRF (as packed) + mov (4) uwDEST_Y(0,1)<4> uwAVS_RESPONSE(2,0)<4;4,1> + mov (4) uwDEST_Y(1,1)<4> uwAVS_RESPONSE(2,8)<4;4,1> + mov (4) uwDEST_Y(4,1)<4> uwAVS_RESPONSE(2,4)<4;4,1> + mov (4) uwDEST_Y(5,1)<4> uwAVS_RESPONSE(2,12)<4;4,1> + mov (4) uwDEST_Y(8,1)<4> uwAVS_RESPONSE(3,0)<4;4,1> + mov (4) uwDEST_Y(9,1)<4> uwAVS_RESPONSE(3,8)<4;4,1> + mov (4) uwDEST_Y(12,1)<4> uwAVS_RESPONSE(3,4)<4;4,1> + mov (4) uwDEST_Y(13,1)<4> uwAVS_RESPONSE(3,12)<4;4,1> + mov (4) uwDEST_Y(16,1)<4> uwAVS_RESPONSE(10,0)<4;4,1> + mov (4) uwDEST_Y(17,1)<4> uwAVS_RESPONSE(10,8)<4;4,1> + mov (4) uwDEST_Y(20,1)<4> uwAVS_RESPONSE(10,4)<4;4,1> + mov (4) uwDEST_Y(21,1)<4> uwAVS_RESPONSE(10,12)<4;4,1> + mov (4) uwDEST_Y(24,1)<4> uwAVS_RESPONSE(11,0)<4;4,1> + mov (4) uwDEST_Y(25,1)<4> uwAVS_RESPONSE(11,8)<4;4,1> + mov (4) uwDEST_Y(28,1)<4> uwAVS_RESPONSE(11,4)<4;4,1> + mov (4) uwDEST_Y(29,1)<4> uwAVS_RESPONSE(11,12)<4;4,1> + 
+// Move first 8x8 words of R to dest GRF (as packed) + mov (4) uwDEST_Y(0,0)<4> uwAVS_RESPONSE(0,0)<4;4,1> + mov (4) uwDEST_Y(1,0)<4> uwAVS_RESPONSE(0,8)<4;4,1> + mov (4) uwDEST_Y(4,0)<4> uwAVS_RESPONSE(0,4)<4;4,1> + mov (4) uwDEST_Y(5,0)<4> uwAVS_RESPONSE(0,12)<4;4,1> + mov (4) uwDEST_Y(8,0)<4> uwAVS_RESPONSE(1,0)<4;4,1> + mov (4) uwDEST_Y(9,0)<4> uwAVS_RESPONSE(1,8)<4;4,1> + mov (4) uwDEST_Y(12,0)<4> uwAVS_RESPONSE(1,4)<4;4,1> + mov (4) uwDEST_Y(13,0)<4> uwAVS_RESPONSE(1,12)<4;4,1> + mov (4) uwDEST_Y(16,0)<4> uwAVS_RESPONSE(8,0)<4;4,1> + mov (4) uwDEST_Y(17,0)<4> uwAVS_RESPONSE(8,8)<4;4,1> + mov (4) uwDEST_Y(20,0)<4> uwAVS_RESPONSE(8,4)<4;4,1> + mov (4) uwDEST_Y(21,0)<4> uwAVS_RESPONSE(8,12)<4;4,1> + mov (4) uwDEST_Y(24,0)<4> uwAVS_RESPONSE(9,0)<4;4,1> + mov (4) uwDEST_Y(25,0)<4> uwAVS_RESPONSE(9,8)<4;4,1> + mov (4) uwDEST_Y(28,0)<4> uwAVS_RESPONSE(9,4)<4;4,1> + mov (4) uwDEST_Y(29,0)<4> uwAVS_RESPONSE(9,12)<4;4,1> + +// Move first 8x8 words of A to dest GRF (as packed) + mov (4) uwDEST_Y(0,3)<4> uwAVS_RESPONSE(6,0)<4;4,1> + mov (4) uwDEST_Y(1,3)<4> uwAVS_RESPONSE(6,8)<4;4,1> + mov (4) uwDEST_Y(4,3)<4> uwAVS_RESPONSE(6,4)<4;4,1> + mov (4) uwDEST_Y(5,3)<4> uwAVS_RESPONSE(6,12)<4;4,1> + mov (4) uwDEST_Y(8,3)<4> uwAVS_RESPONSE(7,0)<4;4,1> + mov (4) uwDEST_Y(9,3)<4> uwAVS_RESPONSE(7,8)<4;4,1> + mov (4) uwDEST_Y(12,3)<4> uwAVS_RESPONSE(7,4)<4;4,1> + mov (4) uwDEST_Y(13,3)<4> uwAVS_RESPONSE(7,12)<4;4,1> + mov (4) uwDEST_Y(16,3)<4> uwAVS_RESPONSE(14,0)<4;4,1> + mov (4) uwDEST_Y(17,3)<4> uwAVS_RESPONSE(14,8)<4;4,1> + mov (4) uwDEST_Y(20,3)<4> uwAVS_RESPONSE(14,4)<4;4,1> + mov (4) uwDEST_Y(21,3)<4> uwAVS_RESPONSE(14,12)<4;4,1> + mov (4) uwDEST_Y(24,3)<4> uwAVS_RESPONSE(15,0)<4;4,1> + mov (4) uwDEST_Y(25,3)<4> uwAVS_RESPONSE(15,8)<4;4,1> + mov (4) uwDEST_Y(28,3)<4> uwAVS_RESPONSE(15,4)<4;4,1> + mov (4) uwDEST_Y(29,3)<4> uwAVS_RESPONSE(15,12)<4;4,1> + +// Move second 8x8 words of B to dest GRF + mov (4) uwDEST_Y(2,2)<4> uwAVS_RESPONSE_2(4,0)<4;4,1> + mov (4) 
uwDEST_Y(3,2)<4> uwAVS_RESPONSE_2(4,8)<4;4,1> + mov (4) uwDEST_Y(6,2)<4> uwAVS_RESPONSE_2(4,4)<4;4,1> + mov (4) uwDEST_Y(7,2)<4> uwAVS_RESPONSE_2(4,12)<4;4,1> + mov (4) uwDEST_Y(10,2)<4> uwAVS_RESPONSE_2(5,0)<4;4,1> + mov (4) uwDEST_Y(11,2)<4> uwAVS_RESPONSE_2(5,8)<4;4,1> + mov (4) uwDEST_Y(14,2)<4> uwAVS_RESPONSE_2(5,4)<4;4,1> + mov (4) uwDEST_Y(15,2)<4> uwAVS_RESPONSE_2(5,12)<4;4,1> + mov (4) uwDEST_Y(18,2)<4> uwAVS_RESPONSE_2(12,0)<4;4,1> + mov (4) uwDEST_Y(19,2)<4> uwAVS_RESPONSE_2(12,8)<4;4,1> + mov (4) uwDEST_Y(22,2)<4> uwAVS_RESPONSE_2(12,4)<4;4,1> + mov (4) uwDEST_Y(23,2)<4> uwAVS_RESPONSE_2(12,12)<4;4,1> + mov (4) uwDEST_Y(26,2)<4> uwAVS_RESPONSE_2(13,0)<4;4,1> + mov (4) uwDEST_Y(27,2)<4> uwAVS_RESPONSE_2(13,8)<4;4,1> + mov (4) uwDEST_Y(30,2)<4> uwAVS_RESPONSE_2(13,4)<4;4,1> + mov (4) uwDEST_Y(31,2)<4> uwAVS_RESPONSE_2(13,12)<4;4,1> + +// Move second 8x8 words of G to dest GRF + mov (4) uwDEST_Y(2,1)<4> uwAVS_RESPONSE_2(2,0)<4;4,1> + mov (4) uwDEST_Y(3,1)<4> uwAVS_RESPONSE_2(2,8)<4;4,1> + mov (4) uwDEST_Y(6,1)<4> uwAVS_RESPONSE_2(2,4)<4;4,1> + mov (4) uwDEST_Y(7,1)<4> uwAVS_RESPONSE_2(2,12)<4;4,1> + mov (4) uwDEST_Y(10,1)<4> uwAVS_RESPONSE_2(3,0)<4;4,1> + mov (4) uwDEST_Y(11,1)<4> uwAVS_RESPONSE_2(3,8)<4;4,1> + mov (4) uwDEST_Y(14,1)<4> uwAVS_RESPONSE_2(3,4)<4;4,1> + mov (4) uwDEST_Y(15,1)<4> uwAVS_RESPONSE_2(3,12)<4;4,1> + mov (4) uwDEST_Y(18,1)<4> uwAVS_RESPONSE_2(10,0)<4;4,1> + mov (4) uwDEST_Y(19,1)<4> uwAVS_RESPONSE_2(10,8)<4;4,1> + mov (4) uwDEST_Y(22,1)<4> uwAVS_RESPONSE_2(10,4)<4;4,1> + mov (4) uwDEST_Y(23,1)<4> uwAVS_RESPONSE_2(10,12)<4;4,1> + mov (4) uwDEST_Y(26,1)<4> uwAVS_RESPONSE_2(11,0)<4;4,1> + mov (4) uwDEST_Y(27,1)<4> uwAVS_RESPONSE_2(11,8)<4;4,1> + mov (4) uwDEST_Y(30,1)<4> uwAVS_RESPONSE_2(11,4)<4;4,1> + mov (4) uwDEST_Y(31,1)<4> uwAVS_RESPONSE_2(11,12)<4;4,1> + +// Move second 8x8 words of R to dest GRF + mov (4) uwDEST_Y(2,0)<4> uwAVS_RESPONSE_2(0,0)<4;4,1> + mov (4) uwDEST_Y(3,0)<4> uwAVS_RESPONSE_2(0,8)<4;4,1> + mov (4) 
uwDEST_Y(6,0)<4> uwAVS_RESPONSE_2(0,4)<4;4,1> + mov (4) uwDEST_Y(7,0)<4> uwAVS_RESPONSE_2(0,12)<4;4,1> + mov (4) uwDEST_Y(10,0)<4> uwAVS_RESPONSE_2(1,0)<4;4,1> + mov (4) uwDEST_Y(11,0)<4> uwAVS_RESPONSE_2(1,8)<4;4,1> + mov (4) uwDEST_Y(14,0)<4> uwAVS_RESPONSE_2(1,4)<4;4,1> + mov (4) uwDEST_Y(15,0)<4> uwAVS_RESPONSE_2(1,12)<4;4,1> + mov (4) uwDEST_Y(18,0)<4> uwAVS_RESPONSE_2(8,0)<4;4,1> + mov (4) uwDEST_Y(19,0)<4> uwAVS_RESPONSE_2(8,8)<4;4,1> + mov (4) uwDEST_Y(22,0)<4> uwAVS_RESPONSE_2(8,4)<4;4,1> + mov (4) uwDEST_Y(23,0)<4> uwAVS_RESPONSE_2(8,12)<4;4,1> + mov (4) uwDEST_Y(26,0)<4> uwAVS_RESPONSE_2(9,0)<4;4,1> + mov (4) uwDEST_Y(27,0)<4> uwAVS_RESPONSE_2(9,8)<4;4,1> + mov (4) uwDEST_Y(30,0)<4> uwAVS_RESPONSE_2(9,4)<4;4,1> + mov (4) uwDEST_Y(31,0)<4> uwAVS_RESPONSE_2(9,12)<4;4,1> + +// Move second 8x8 words of A to dest GRF + mov (4) uwDEST_Y(2,3)<4> uwAVS_RESPONSE_2(6,0)<4;4,1> + mov (4) uwDEST_Y(3,3)<4> uwAVS_RESPONSE_2(6,8)<4;4,1> + mov (4) uwDEST_Y(6,3)<4> uwAVS_RESPONSE_2(6,4)<4;4,1> + mov (4) uwDEST_Y(7,3)<4> uwAVS_RESPONSE_2(6,12)<4;4,1> + mov (4) uwDEST_Y(10,3)<4> uwAVS_RESPONSE_2(7,0)<4;4,1> + mov (4) uwDEST_Y(11,3)<4> uwAVS_RESPONSE_2(7,8)<4;4,1> + mov (4) uwDEST_Y(14,3)<4> uwAVS_RESPONSE_2(7,4)<4;4,1> + mov (4) uwDEST_Y(15,3)<4> uwAVS_RESPONSE_2(7,12)<4;4,1> + mov (4) uwDEST_Y(18,3)<4> uwAVS_RESPONSE_2(14,0)<4;4,1> + mov (4) uwDEST_Y(19,3)<4> uwAVS_RESPONSE_2(14,8)<4;4,1> + mov (4) uwDEST_Y(22,3)<4> uwAVS_RESPONSE_2(14,4)<4;4,1> + mov (4) uwDEST_Y(23,3)<4> uwAVS_RESPONSE_2(14,12)<4;4,1> + mov (4) uwDEST_Y(26,3)<4> uwAVS_RESPONSE_2(15,0)<4;4,1> + mov (4) uwDEST_Y(27,3)<4> uwAVS_RESPONSE_2(15,8)<4;4,1> + mov (4) uwDEST_Y(30,3)<4> uwAVS_RESPONSE_2(15,4)<4;4,1> + mov (4) uwDEST_Y(31,3)<4> uwAVS_RESPONSE_2(15,12)<4;4,1> + +#else /* OUTPUT_8_BIT */ +// Move first 8x8 words of B to dest GRF + mov (8) ubDEST_Y(0,2)<4> ubAVS_RESPONSE(4,1)<16;4,2> + mov (8) ubDEST_Y(2,2)<4> ubAVS_RESPONSE(4,8+1)<16;4,2> + mov (8) ubDEST_Y(4,2)<4> ubAVS_RESPONSE(5,1)<16;4,2> + mov 
(8) ubDEST_Y(6,2)<4> ubAVS_RESPONSE(5,8+1)<16;4,2> + mov (8) ubDEST_Y(8,2)<4> ubAVS_RESPONSE(12,1)<16;4,2> + mov (8) ubDEST_Y(10,2)<4> ubAVS_RESPONSE(12,8+1)<16;4,2> + mov (8) ubDEST_Y(12,2)<4> ubAVS_RESPONSE(13,1)<16;4,2> + mov (8) ubDEST_Y(14,2)<4> ubAVS_RESPONSE(13,8+1)<16;4,2> + +// Move first 8x8 words of G to dest GRF + mov (8) ubDEST_Y(0,1)<4> ubAVS_RESPONSE(2,1)<16;4,2> + mov (8) ubDEST_Y(2,1)<4> ubAVS_RESPONSE(2,8+1)<16;4,2> + mov (8) ubDEST_Y(4,1)<4> ubAVS_RESPONSE(3,1)<16;4,2> + mov (8) ubDEST_Y(6,1)<4> ubAVS_RESPONSE(3,8+1)<16;4,2> + mov (8) ubDEST_Y(8,1)<4> ubAVS_RESPONSE(10,1)<16;4,2> + mov (8) ubDEST_Y(10,1)<4> ubAVS_RESPONSE(10,8+1)<16;4,2> + mov (8) ubDEST_Y(12,1)<4> ubAVS_RESPONSE(11,1)<16;4,2> + mov (8) ubDEST_Y(14,1)<4> ubAVS_RESPONSE(11,8+1)<16;4,2> + +// Move first 8x8 words of R to dest GRF + mov (8) ubDEST_Y(0,0)<4> ubAVS_RESPONSE(0,1)<16;4,2> + mov (8) ubDEST_Y(2,0)<4> ubAVS_RESPONSE(0,8+1)<16;4,2> + mov (8) ubDEST_Y(4,0)<4> ubAVS_RESPONSE(1,1)<16;4,2> + mov (8) ubDEST_Y(6,0)<4> ubAVS_RESPONSE(1,8+1)<16;4,2> + mov (8) ubDEST_Y(8,0)<4> ubAVS_RESPONSE(8,1)<16;4,2> + mov (8) ubDEST_Y(10,0)<4> ubAVS_RESPONSE(8,8+1)<16;4,2> + mov (8) ubDEST_Y(12,0)<4> ubAVS_RESPONSE(9,1)<16;4,2> + mov (8) ubDEST_Y(14,0)<4> ubAVS_RESPONSE(9,8+1)<16;4,2> + +// Move first 8x8 words of A to dest GRF + mov (8) ubDEST_Y(0,3)<4> ubAVS_RESPONSE(6,1)<16;4,2> + mov (8) ubDEST_Y(2,3)<4> ubAVS_RESPONSE(6,8+1)<16;4,2> + mov (8) ubDEST_Y(4,3)<4> ubAVS_RESPONSE(7,1)<16;4,2> + mov (8) ubDEST_Y(6,3)<4> ubAVS_RESPONSE(7,8+1)<16;4,2> + mov (8) ubDEST_Y(8,3)<4> ubAVS_RESPONSE(14,1)<16;4,2> + mov (8) ubDEST_Y(10,3)<4> ubAVS_RESPONSE(14,8+1)<16;4,2> + mov (8) ubDEST_Y(12,3)<4> ubAVS_RESPONSE(15,1)<16;4,2> + mov (8) ubDEST_Y(14,3)<4> ubAVS_RESPONSE(15,8+1)<16;4,2> + +// Move second 8x8 words of B to dest GRF + mov (8) ubDEST_Y(1,2)<4> ubAVS_RESPONSE_2(4,1)<16;4,2> + mov (8) ubDEST_Y(3,2)<4> ubAVS_RESPONSE_2(4,8+1)<16;4,2> + mov (8) ubDEST_Y(5,2)<4> ubAVS_RESPONSE_2(5,1)<16;4,2> + mov 
(8) ubDEST_Y(7,2)<4> ubAVS_RESPONSE_2(5,8+1)<16;4,2> + mov (8) ubDEST_Y(9,2)<4> ubAVS_RESPONSE_2(12,1)<16;4,2> + mov (8) ubDEST_Y(11,2)<4> ubAVS_RESPONSE_2(12,8+1)<16;4,2> + mov (8) ubDEST_Y(13,2)<4> ubAVS_RESPONSE_2(13,1)<16;4,2> + mov (8) ubDEST_Y(15,2)<4> ubAVS_RESPONSE_2(13,8+1)<16;4,2> + +// Move second 8x8 words of G to dest GRF + mov (8) ubDEST_Y(1,1)<4> ubAVS_RESPONSE_2(2,1)<16;4,2> + mov (8) ubDEST_Y(3,1)<4> ubAVS_RESPONSE_2(2,8+1)<16;4,2> + mov (8) ubDEST_Y(5,1)<4> ubAVS_RESPONSE_2(3,1)<16;4,2> + mov (8) ubDEST_Y(7,1)<4> ubAVS_RESPONSE_2(3,8+1)<16;4,2> + mov (8) ubDEST_Y(9,1)<4> ubAVS_RESPONSE_2(10,1)<16;4,2> + mov (8) ubDEST_Y(11,1)<4> ubAVS_RESPONSE_2(10,8+1)<16;4,2> + mov (8) ubDEST_Y(13,1)<4> ubAVS_RESPONSE_2(11,1)<16;4,2> + mov (8) ubDEST_Y(15,1)<4> ubAVS_RESPONSE_2(11,8+1)<16;4,2> + +// Move second 8x8 words of R to dest GRF + mov (8) ubDEST_Y(1,0)<4> ubAVS_RESPONSE_2(0,1)<16;4,2> + mov (8) ubDEST_Y(3,0)<4> ubAVS_RESPONSE_2(0,8+1)<16;4,2> + mov (8) ubDEST_Y(5,0)<4> ubAVS_RESPONSE_2(1,1)<16;4,2> + mov (8) ubDEST_Y(7,0)<4> ubAVS_RESPONSE_2(1,8+1)<16;4,2> + mov (8) ubDEST_Y(9,0)<4> ubAVS_RESPONSE_2(8,1)<16;4,2> + mov (8) ubDEST_Y(11,0)<4> ubAVS_RESPONSE_2(8,8+1)<16;4,2> + mov (8) ubDEST_Y(13,0)<4> ubAVS_RESPONSE_2(9,1)<16;4,2> + mov (8) ubDEST_Y(15,0)<4> ubAVS_RESPONSE_2(9,8+1)<16;4,2> + +// Move second 8x8 words of A to dest GRF + mov (8) ubDEST_Y(1,3)<4> ubAVS_RESPONSE_2(6,1)<16;4,2> + mov (8) ubDEST_Y(3,3)<4> ubAVS_RESPONSE_2(6,8+1)<16;4,2> + mov (8) ubDEST_Y(5,3)<4> ubAVS_RESPONSE_2(7,1)<16;4,2> + mov (8) ubDEST_Y(7,3)<4> ubAVS_RESPONSE_2(7,8+1)<16;4,2> + mov (8) ubDEST_Y(9,3)<4> ubAVS_RESPONSE_2(14,1)<16;4,2> + mov (8) ubDEST_Y(11,3)<4> ubAVS_RESPONSE_2(14,8+1)<16;4,2> + mov (8) ubDEST_Y(13,3)<4> ubAVS_RESPONSE_2(15,1)<16;4,2> + mov (8) ubDEST_Y(15,3)<4> ubAVS_RESPONSE_2(15,8+1)<16;4,2> +#endif +//------------------------------------------------------------------------------ + + // Set to write bottom region to memory + #define SRC_REGION REGION_2 
+ + // Re-define new # of lines + #undef nUV_NUM_OF_ROWS + #undef nY_NUM_OF_ROWS + + #define nY_NUM_OF_ROWS 8 + #define nUV_NUM_OF_ROWS 8 + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unscramble_16x8.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unscramble_16x8.asm new file mode 100644 index 0000000..b81923f --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_AVS_IEF_Unscramble_16x8.asm @@ -0,0 +1,260 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- RGB_AVS_IEF_Unscramble_16x8.asm ---------- +#include "AVS_IEF.inc" + +.declare DEST_B Base=REG(r,10) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare DEST_G Base=REG(r,18) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare DEST_R Base=REG(r,26) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare DEST_A Base=REG(r,34) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw + + +#ifdef AVS_OUTPUT_16_BIT +//This portion will need to be changed if unpacking is required for Y416 kernels (in case of blending etc) - vK + +//// Move first 8x8 words of B to dest GRF (as packed) +// mov (4) uwDEST_Y(0,2)<4> uwAVS_RESPONSE(4,0)<4;4,1> +// mov (4) uwDEST_Y(1,2)<4> uwAVS_RESPONSE(4,8)<4;4,1> +// mov (4) uwDEST_Y(4,2)<4> uwAVS_RESPONSE(4,4)<4;4,1> +// mov (4) uwDEST_Y(5,2)<4> uwAVS_RESPONSE(4,12)<4;4,1> +// mov (4) uwDEST_Y(8,2)<4> uwAVS_RESPONSE(5,0)<4;4,1> +// mov (4) uwDEST_Y(9,2)<4> uwAVS_RESPONSE(5,8)<4;4,1> +// mov (4) uwDEST_Y(12,2)<4> uwAVS_RESPONSE(5,4)<4;4,1> +// mov (4) uwDEST_Y(13,2)<4> uwAVS_RESPONSE(5,12)<4;4,1> +// mov (4) uwDEST_Y(16,2)<4> uwAVS_RESPONSE(12,0)<4;4,1> +// mov (4) uwDEST_Y(17,2)<4> uwAVS_RESPONSE(12,8)<4;4,1> +// 
mov (4) uwDEST_Y(20,2)<4> uwAVS_RESPONSE(12,4)<4;4,1> +// mov (4) uwDEST_Y(21,2)<4> uwAVS_RESPONSE(12,12)<4;4,1> +// mov (4) uwDEST_Y(24,2)<4> uwAVS_RESPONSE(13,0)<4;4,1> +// mov (4) uwDEST_Y(25,2)<4> uwAVS_RESPONSE(13,8)<4;4,1> +// mov (4) uwDEST_Y(28,2)<4> uwAVS_RESPONSE(13,4)<4;4,1> +// mov (4) uwDEST_Y(29,2)<4> uwAVS_RESPONSE(13,12)<4;4,1> +// +//// Move first 8x8 words of G to dest GRF (as packed) +// mov (4) uwDEST_Y(0,1)<4> uwAVS_RESPONSE(2,0)<4;4,1> +// mov (4) uwDEST_Y(1,1)<4> uwAVS_RESPONSE(2,8)<4;4,1> +// mov (4) uwDEST_Y(4,1)<4> uwAVS_RESPONSE(2,4)<4;4,1> +// mov (4) uwDEST_Y(5,1)<4> uwAVS_RESPONSE(2,12)<4;4,1> +// mov (4) uwDEST_Y(8,1)<4> uwAVS_RESPONSE(3,0)<4;4,1> +// mov (4) uwDEST_Y(9,1)<4> uwAVS_RESPONSE(3,8)<4;4,1> +// mov (4) uwDEST_Y(12,1)<4> uwAVS_RESPONSE(3,4)<4;4,1> +// mov (4) uwDEST_Y(13,1)<4> uwAVS_RESPONSE(3,12)<4;4,1> +// mov (4) uwDEST_Y(16,1)<4> uwAVS_RESPONSE(10,0)<4;4,1> +// mov (4) uwDEST_Y(17,1)<4> uwAVS_RESPONSE(10,8)<4;4,1> +// mov (4) uwDEST_Y(20,1)<4> uwAVS_RESPONSE(10,4)<4;4,1> +// mov (4) uwDEST_Y(21,1)<4> uwAVS_RESPONSE(10,12)<4;4,1> +// mov (4) uwDEST_Y(24,1)<4> uwAVS_RESPONSE(11,0)<4;4,1> +// mov (4) uwDEST_Y(25,1)<4> uwAVS_RESPONSE(11,8)<4;4,1> +// mov (4) uwDEST_Y(28,1)<4> uwAVS_RESPONSE(11,4)<4;4,1> +// mov (4) uwDEST_Y(29,1)<4> uwAVS_RESPONSE(11,12)<4;4,1> +// +//// Move first 8x8 words of R to dest GRF (as packed) +// mov (4) uwDEST_Y(0,0)<4> uwAVS_RESPONSE(0,0)<4;4,1> +// mov (4) uwDEST_Y(1,0)<4> uwAVS_RESPONSE(0,8)<4;4,1> +// mov (4) uwDEST_Y(4,0)<4> uwAVS_RESPONSE(0,4)<4;4,1> +// mov (4) uwDEST_Y(5,0)<4> uwAVS_RESPONSE(0,12)<4;4,1> +// mov (4) uwDEST_Y(8,0)<4> uwAVS_RESPONSE(1,0)<4;4,1> +// mov (4) uwDEST_Y(9,0)<4> uwAVS_RESPONSE(1,8)<4;4,1> +// mov (4) uwDEST_Y(12,0)<4> uwAVS_RESPONSE(1,4)<4;4,1> +// mov (4) uwDEST_Y(13,0)<4> uwAVS_RESPONSE(1,12)<4;4,1> +// mov (4) uwDEST_Y(16,0)<4> uwAVS_RESPONSE(8,0)<4;4,1> +// mov (4) uwDEST_Y(17,0)<4> uwAVS_RESPONSE(8,8)<4;4,1> +// mov (4) uwDEST_Y(20,0)<4> 
uwAVS_RESPONSE(8,4)<4;4,1> +// mov (4) uwDEST_Y(21,0)<4> uwAVS_RESPONSE(8,12)<4;4,1> +// mov (4) uwDEST_Y(24,0)<4> uwAVS_RESPONSE(9,0)<4;4,1> +// mov (4) uwDEST_Y(25,0)<4> uwAVS_RESPONSE(9,8)<4;4,1> +// mov (4) uwDEST_Y(28,0)<4> uwAVS_RESPONSE(9,4)<4;4,1> +// mov (4) uwDEST_Y(29,0)<4> uwAVS_RESPONSE(9,12)<4;4,1> +// +//// Move first 8x8 words of A to dest GRF (as packed) +// mov (4) uwDEST_Y(0,3)<4> uwAVS_RESPONSE(6,0)<4;4,1> +// mov (4) uwDEST_Y(1,3)<4> uwAVS_RESPONSE(6,8)<4;4,1> +// mov (4) uwDEST_Y(4,3)<4> uwAVS_RESPONSE(6,4)<4;4,1> +// mov (4) uwDEST_Y(5,3)<4> uwAVS_RESPONSE(6,12)<4;4,1> +// mov (4) uwDEST_Y(8,3)<4> uwAVS_RESPONSE(7,0)<4;4,1> +// mov (4) uwDEST_Y(9,3)<4> uwAVS_RESPONSE(7,8)<4;4,1> +// mov (4) uwDEST_Y(12,3)<4> uwAVS_RESPONSE(7,4)<4;4,1> +// mov (4) uwDEST_Y(13,3)<4> uwAVS_RESPONSE(7,12)<4;4,1> +// mov (4) uwDEST_Y(16,3)<4> uwAVS_RESPONSE(14,0)<4;4,1> +// mov (4) uwDEST_Y(17,3)<4> uwAVS_RESPONSE(14,8)<4;4,1> +// mov (4) uwDEST_Y(20,3)<4> uwAVS_RESPONSE(14,4)<4;4,1> +// mov (4) uwDEST_Y(21,3)<4> uwAVS_RESPONSE(14,12)<4;4,1> +// mov (4) uwDEST_Y(24,3)<4> uwAVS_RESPONSE(15,0)<4;4,1> +// mov (4) uwDEST_Y(25,3)<4> uwAVS_RESPONSE(15,8)<4;4,1> +// mov (4) uwDEST_Y(28,3)<4> uwAVS_RESPONSE(15,4)<4;4,1> +// mov (4) uwDEST_Y(29,3)<4> uwAVS_RESPONSE(15,12)<4;4,1> +// +//// Move second 8x8 words of B to dest GRF +// mov (4) uwDEST_Y(2,2)<4> uwAVS_RESPONSE_2(4,0)<4;4,1> +// mov (4) uwDEST_Y(3,2)<4> uwAVS_RESPONSE_2(4,8)<4;4,1> +// mov (4) uwDEST_Y(6,2)<4> uwAVS_RESPONSE_2(4,4)<4;4,1> +// mov (4) uwDEST_Y(7,2)<4> uwAVS_RESPONSE_2(4,12)<4;4,1> +// mov (4) uwDEST_Y(10,2)<4> uwAVS_RESPONSE_2(5,0)<4;4,1> +// mov (4) uwDEST_Y(11,2)<4> uwAVS_RESPONSE_2(5,8)<4;4,1> +// mov (4) uwDEST_Y(14,2)<4> uwAVS_RESPONSE_2(5,4)<4;4,1> +// mov (4) uwDEST_Y(15,2)<4> uwAVS_RESPONSE_2(5,12)<4;4,1> +// mov (4) uwDEST_Y(18,2)<4> uwAVS_RESPONSE_2(12,0)<4;4,1> +// mov (4) uwDEST_Y(19,2)<4> uwAVS_RESPONSE_2(12,8)<4;4,1> +// mov (4) uwDEST_Y(22,2)<4> uwAVS_RESPONSE_2(12,4)<4;4,1> +// mov 
(4) uwDEST_Y(23,2)<4> uwAVS_RESPONSE_2(12,12)<4;4,1> +// mov (4) uwDEST_Y(26,2)<4> uwAVS_RESPONSE_2(13,0)<4;4,1> +// mov (4) uwDEST_Y(27,2)<4> uwAVS_RESPONSE_2(13,8)<4;4,1> +// mov (4) uwDEST_Y(30,2)<4> uwAVS_RESPONSE_2(13,4)<4;4,1> +// mov (4) uwDEST_Y(31,2)<4> uwAVS_RESPONSE_2(13,12)<4;4,1> +// +//// Move second 8x8 words of G to dest GRF +// mov (4) uwDEST_Y(2,1)<4> uwAVS_RESPONSE_2(2,0)<4;4,1> +// mov (4) uwDEST_Y(3,1)<4> uwAVS_RESPONSE_2(2,8)<4;4,1> +// mov (4) uwDEST_Y(6,1)<4> uwAVS_RESPONSE_2(2,4)<4;4,1> +// mov (4) uwDEST_Y(7,1)<4> uwAVS_RESPONSE_2(2,12)<4;4,1> +// mov (4) uwDEST_Y(10,1)<4> uwAVS_RESPONSE_2(3,0)<4;4,1> +// mov (4) uwDEST_Y(11,1)<4> uwAVS_RESPONSE_2(3,8)<4;4,1> +// mov (4) uwDEST_Y(14,1)<4> uwAVS_RESPONSE_2(3,4)<4;4,1> +// mov (4) uwDEST_Y(15,1)<4> uwAVS_RESPONSE_2(3,12)<4;4,1> +// mov (4) uwDEST_Y(18,1)<4> uwAVS_RESPONSE_2(10,0)<4;4,1> +// mov (4) uwDEST_Y(19,1)<4> uwAVS_RESPONSE_2(10,8)<4;4,1> +// mov (4) uwDEST_Y(22,1)<4> uwAVS_RESPONSE_2(10,4)<4;4,1> +// mov (4) uwDEST_Y(23,1)<4> uwAVS_RESPONSE_2(10,12)<4;4,1> +// mov (4) uwDEST_Y(26,1)<4> uwAVS_RESPONSE_2(11,0)<4;4,1> +// mov (4) uwDEST_Y(27,1)<4> uwAVS_RESPONSE_2(11,8)<4;4,1> +// mov (4) uwDEST_Y(30,1)<4> uwAVS_RESPONSE_2(11,4)<4;4,1> +// mov (4) uwDEST_Y(31,1)<4> uwAVS_RESPONSE_2(11,12)<4;4,1> +// +//// Move second 8x8 words of R to dest GRF +// mov (4) uwDEST_Y(2,0)<4> uwAVS_RESPONSE_2(0,0)<4;4,1> +// mov (4) uwDEST_Y(3,0)<4> uwAVS_RESPONSE_2(0,8)<4;4,1> +// mov (4) uwDEST_Y(6,0)<4> uwAVS_RESPONSE_2(0,4)<4;4,1> +// mov (4) uwDEST_Y(7,0)<4> uwAVS_RESPONSE_2(0,12)<4;4,1> +// mov (4) uwDEST_Y(10,0)<4> uwAVS_RESPONSE_2(1,0)<4;4,1> +// mov (4) uwDEST_Y(11,0)<4> uwAVS_RESPONSE_2(1,8)<4;4,1> +// mov (4) uwDEST_Y(14,0)<4> uwAVS_RESPONSE_2(1,4)<4;4,1> +// mov (4) uwDEST_Y(15,0)<4> uwAVS_RESPONSE_2(1,12)<4;4,1> +// mov (4) uwDEST_Y(18,0)<4> uwAVS_RESPONSE_2(8,0)<4;4,1> +// mov (4) uwDEST_Y(19,0)<4> uwAVS_RESPONSE_2(8,8)<4;4,1> +// mov (4) uwDEST_Y(22,0)<4> uwAVS_RESPONSE_2(8,4)<4;4,1> +// mov 
(4) uwDEST_Y(23,0)<4> uwAVS_RESPONSE_2(8,12)<4;4,1> +// mov (4) uwDEST_Y(26,0)<4> uwAVS_RESPONSE_2(9,0)<4;4,1> +// mov (4) uwDEST_Y(27,0)<4> uwAVS_RESPONSE_2(9,8)<4;4,1> +// mov (4) uwDEST_Y(30,0)<4> uwAVS_RESPONSE_2(9,4)<4;4,1> +// mov (4) uwDEST_Y(31,0)<4> uwAVS_RESPONSE_2(9,12)<4;4,1> +// +//// Move second 8x8 words of A to dest GRF +// mov (4) uwDEST_Y(2,3)<4> uwAVS_RESPONSE_2(6,0)<4;4,1> +// mov (4) uwDEST_Y(3,3)<4> uwAVS_RESPONSE_2(6,8)<4;4,1> +// mov (4) uwDEST_Y(6,3)<4> uwAVS_RESPONSE_2(6,4)<4;4,1> +// mov (4) uwDEST_Y(7,3)<4> uwAVS_RESPONSE_2(6,12)<4;4,1> +// mov (4) uwDEST_Y(10,3)<4> uwAVS_RESPONSE_2(7,0)<4;4,1> +// mov (4) uwDEST_Y(11,3)<4> uwAVS_RESPONSE_2(7,8)<4;4,1> +// mov (4) uwDEST_Y(14,3)<4> uwAVS_RESPONSE_2(7,4)<4;4,1> +// mov (4) uwDEST_Y(15,3)<4> uwAVS_RESPONSE_2(7,12)<4;4,1> +// mov (4) uwDEST_Y(18,3)<4> uwAVS_RESPONSE_2(14,0)<4;4,1> +// mov (4) uwDEST_Y(19,3)<4> uwAVS_RESPONSE_2(14,8)<4;4,1> +// mov (4) uwDEST_Y(22,3)<4> uwAVS_RESPONSE_2(14,4)<4;4,1> +// mov (4) uwDEST_Y(23,3)<4> uwAVS_RESPONSE_2(14,12)<4;4,1> +// mov (4) uwDEST_Y(26,3)<4> uwAVS_RESPONSE_2(15,0)<4;4,1> +// mov (4) uwDEST_Y(27,3)<4> uwAVS_RESPONSE_2(15,8)<4;4,1> +// mov (4) uwDEST_Y(30,3)<4> uwAVS_RESPONSE_2(15,4)<4;4,1> +// mov (4) uwDEST_Y(31,3)<4> uwAVS_RESPONSE_2(15,12)<4;4,1> + +#else /* OUTPUT_8_BIT */ + +// Move first 8x8 words of B to dest GRF + mov (8) DEST_B(0)<1> ubAVS_RESPONSE(4,1)<16;4,2> + mov (8) DEST_B(1)<1> ubAVS_RESPONSE(4,8+1)<16;4,2> + mov (8) DEST_B(2)<1> ubAVS_RESPONSE(5,1)<16;4,2> + mov (8) DEST_B(3)<1> ubAVS_RESPONSE(5,8+1)<16;4,2> + mov (8) DEST_B(4)<1> ubAVS_RESPONSE(12,1)<16;4,2> + mov (8) DEST_B(5)<1> ubAVS_RESPONSE(12,8+1)<16;4,2> + mov (8) DEST_B(6)<1> ubAVS_RESPONSE(13,1)<16;4,2> + mov (8) DEST_B(7)<1> ubAVS_RESPONSE(13,8+1)<16;4,2> + +// Move first 8x8 words of G to dest GRF + mov (8) DEST_G(0)<1> ubAVS_RESPONSE(2,1)<16;4,2> + mov (8) DEST_G(1)<1> ubAVS_RESPONSE(2,8+1)<16;4,2> + mov (8) DEST_G(2)<1> ubAVS_RESPONSE(3,1)<16;4,2> + mov (8) 
DEST_G(3)<1> ubAVS_RESPONSE(3,8+1)<16;4,2> + mov (8) DEST_G(4)<1> ubAVS_RESPONSE(10,1)<16;4,2> + mov (8) DEST_G(5)<1> ubAVS_RESPONSE(10,8+1)<16;4,2> + mov (8) DEST_G(6)<1> ubAVS_RESPONSE(11,1)<16;4,2> + mov (8) DEST_G(7)<1> ubAVS_RESPONSE(11,8+1)<16;4,2> + +// Move first 8x8 words of R to dest GRF + mov (8) DEST_R(0)<1> ubAVS_RESPONSE(0,1)<16;4,2> + mov (8) DEST_R(1)<1> ubAVS_RESPONSE(0,8+1)<16;4,2> + mov (8) DEST_R(2)<1> ubAVS_RESPONSE(1,1)<16;4,2> + mov (8) DEST_R(3)<1> ubAVS_RESPONSE(1,8+1)<16;4,2> + mov (8) DEST_R(4)<1> ubAVS_RESPONSE(8,1)<16;4,2> + mov (8) DEST_R(5)<1> ubAVS_RESPONSE(8,8+1)<16;4,2> + mov (8) DEST_R(6)<1> ubAVS_RESPONSE(9,1)<16;4,2> + mov (8) DEST_R(7)<1> ubAVS_RESPONSE(9,8+1)<16;4,2> + +// Move first 8x8 words of A to dest GRF + mov (8) DEST_A(0)<1> ubAVS_RESPONSE(6,1)<16;4,2> + mov (8) DEST_A(1)<1> ubAVS_RESPONSE(6,8+1)<16;4,2> + mov (8) DEST_A(2)<1> ubAVS_RESPONSE(7,1)<16;4,2> + mov (8) DEST_A(3)<1> ubAVS_RESPONSE(7,8+1)<16;4,2> + mov (8) DEST_A(4)<1> ubAVS_RESPONSE(14,1)<16;4,2> + mov (8) DEST_A(5)<1> ubAVS_RESPONSE(14,8+1)<16;4,2> + mov (8) DEST_A(6)<1> ubAVS_RESPONSE(15,1)<16;4,2> + mov (8) DEST_A(7)<1> ubAVS_RESPONSE(15,8+1)<16;4,2> + +// Move second 8x8 words of B to dest GRF + mov (8) DEST_B(0,8)<1> ubAVS_RESPONSE_2(4,1)<16;4,2> + mov (8) DEST_B(1,8)<1> ubAVS_RESPONSE_2(4,8+1)<16;4,2> + mov (8) DEST_B(2,8)<1> ubAVS_RESPONSE_2(5,1)<16;4,2> + mov (8) DEST_B(3,8)<1> ubAVS_RESPONSE_2(5,8+1)<16;4,2> + mov (8) DEST_B(4,8)<1> ubAVS_RESPONSE_2(12,1)<16;4,2> + mov (8) DEST_B(5,8)<1> ubAVS_RESPONSE_2(12,8+1)<16;4,2> + mov (8) DEST_B(6,8)<1> ubAVS_RESPONSE_2(13,1)<16;4,2> + mov (8) DEST_B(7,8)<1> ubAVS_RESPONSE_2(13,8+1)<16;4,2> + +// Move second 8x8 words of G to dest GRF + mov (8) DEST_G(0,8)<1> ubAVS_RESPONSE_2(2,1)<16;4,2> + mov (8) DEST_G(1,8)<1> ubAVS_RESPONSE_2(2,8+1)<16;4,2> + mov (8) DEST_G(2,8)<1> ubAVS_RESPONSE_2(3,1)<16;4,2> + mov (8) DEST_G(3,8)<1> ubAVS_RESPONSE_2(3,8+1)<16;4,2> + mov (8) DEST_G(4,8)<1> 
ubAVS_RESPONSE_2(10,1)<16;4,2> + mov (8) DEST_G(5,8)<1> ubAVS_RESPONSE_2(10,8+1)<16;4,2> + mov (8) DEST_G(6,8)<1> ubAVS_RESPONSE_2(11,1)<16;4,2> + mov (8) DEST_G(7,8)<1> ubAVS_RESPONSE_2(11,8+1)<16;4,2> + +// Move second 8x8 words of R to dest GRF + mov (8) DEST_R(0,8)<1> ubAVS_RESPONSE_2(0,1)<16;4,2> + mov (8) DEST_R(1,8)<1> ubAVS_RESPONSE_2(0,8+1)<16;4,2> + mov (8) DEST_R(2,8)<1> ubAVS_RESPONSE_2(1,1)<16;4,2> + mov (8) DEST_R(3,8)<1> ubAVS_RESPONSE_2(1,8+1)<16;4,2> + mov (8) DEST_R(4,8)<1> ubAVS_RESPONSE_2(8,1)<16;4,2> + mov (8) DEST_R(5,8)<1> ubAVS_RESPONSE_2(8,8+1)<16;4,2> + mov (8) DEST_R(6,8)<1> ubAVS_RESPONSE_2(9,1)<16;4,2> + mov (8) DEST_R(7,8)<1> ubAVS_RESPONSE_2(9,8+1)<16;4,2> + +// Move second 8x8 words of A to dest GRF + mov (8) DEST_A(0,8)<1> ubAVS_RESPONSE_2(6,1)<16;4,2> + mov (8) DEST_A(1,8)<1> ubAVS_RESPONSE_2(6,8+1)<16;4,2> + mov (8) DEST_A(2,8)<1> ubAVS_RESPONSE_2(7,1)<16;4,2> + mov (8) DEST_A(3,8)<1> ubAVS_RESPONSE_2(7,8+1)<16;4,2> + mov (8) DEST_A(4,8)<1> ubAVS_RESPONSE_2(14,1)<16;4,2> + mov (8) DEST_A(5,8)<1> ubAVS_RESPONSE_2(14,8+1)<16;4,2> + mov (8) DEST_A(6,8)<1> ubAVS_RESPONSE_2(15,1)<16;4,2> + mov (8) DEST_A(7,8)<1> ubAVS_RESPONSE_2(15,8+1)<16;4,2> +#endif +//------------------------------------------------------------------------------ + + // Set to write bottom region to memory + #define SRC_REGION REGION_1 + + // Re-define new # of lines + #undef nUV_NUM_OF_ROWS + #undef nY_NUM_OF_ROWS + + #define nY_NUM_OF_ROWS 8 + #define nUV_NUM_OF_ROWS 8 + diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_Scaling.asm b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_Scaling.asm new file mode 100644 index 0000000..7429790 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/RGB_Scaling.asm @@ -0,0 +1,72 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. + * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. 
The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +//---------- RGB_Scaling.asm ---------- +#include "Scaling.inc" + + // Build a 16-element ramp in float32 and normalize it +// mov (8) SAMPLER_RAMP(0)<1> 0x76543210:v +// add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f +mov (4) SAMPLER_RAMP(0)<1> 0x48403000:vf //3, 2, 1, 0 in float vector +mov (4) SAMPLER_RAMP(0,4)<1> 0x5C585450:vf //7, 6, 5, 4 in float vector +add (8) SAMPLER_RAMP(1)<1> SAMPLER_RAMP(0) 8.0:f + +//Module: PrepareScaleCoord.asm + + // Setup for sampler msg hdr + mov (2) rMSGSRC.0<1>:ud 0:ud { NoDDClr } // Unused fields + mov (1) rMSGSRC.2<1>:ud 0:ud { NoDDChk } // Write and offset + + // Calculate 16 v based on the step Y and vertical origin + mov (16) mfMSGPAYLOAD(2)<1> fSRC_VID_V_ORI<0;1,0>:f + mov (16) SCALE_COORD_Y<1>:f fSRC_VID_V_ORI<0;1,0>:f + + // Calculate 16 u based on the step X and horizontal origin +// line (16) mfMSGPAYLOAD(0)<1> SCALE_STEP_X<0;1,0>:f SAMPLER_RAMP(0) // Assign to mrf directly + mov (16) acc0:f fSRC_VID_H_ORI<0;1,0>:f { Compr } + mac (16) mfMSGPAYLOAD(0)<1> fVIDEO_STEP_X<0;1,0>:f SAMPLER_RAMP(0) { Compr } + + //Set up the constants for the line instruction + mov (1) SCALE_LINE_P255<1>:f 255.0:f { NoDDClr } //{ NoDDClr, NoDDChk } + mov (1) SCALE_LINE_P0_5<1>:f 0.5:f { NoDDChk } + + +//------------------------------------------------------------------------------ + +$for (0; <nY_NUM_OF_ROWS; 1) { + + // Read 16 sampled pixels and store them in float32 in 8 GRFs in the order of BGRA (VYUA). 
+ mov (8) MSGHDR_SCALE.0:ud rMSGSRC.0<8;8,1>:ud // Copy msg header and payload mirrors to MRFs + send (16) SCALE_RESPONSE_YW(0)<1> MSGHDR_SCALE udDUMMY_NULL nSMPL_ENGINE SMPLR_MSG_DSC+nSI_SRC_SIMD16_RGB+nBI_CURRENT_SRC_RGB + + // Calculate 16 v for next line + add (16) mfMSGPAYLOAD(2)<1> SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Assign to mrf directly + add (16) SCALE_COORD_Y<1>:f SCALE_COORD_Y<8;8,1>:f fVIDEO_STEP_Y<0;1,0>:f // Update the GRF copy of v as well + + // Scale back to [0, 255], convert f to ud + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(0) { Compr } // Scale the data later moved to DEST_R + mov (16) SCALE_RESPONSE_YD(0)<1> acc0:f { Compr } + + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(2) { Compr } // Scale the data later moved to DEST_G + mov (16) SCALE_RESPONSE_YD(2)<1> acc0:f { Compr } + + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(4) { Compr } // Scale the data later moved to DEST_B + mov (16) SCALE_RESPONSE_YD(4)<1> acc0:f { Compr } + +//#if defined(SAVE_ARGB) //Only needed if Alpha value is written to the destination + line (16) acc0:f SCALE_LINE_P255<0;1,0>:f SCALE_RESPONSE_YF(6) { Compr } // Scale the data later moved to DEST_A + mov (16) SCALE_RESPONSE_YD(6)<1> acc0:f { Compr } +//#endif + + mov (16) DEST_R(%1)<1> SCALE_RESPONSE_YB(0) //possible error due to truncation - vK + mov (16) DEST_G(%1)<1> SCALE_RESPONSE_YB(2) //possible error due to truncation - vK + mov (16) DEST_B(%1)<1> SCALE_RESPONSE_YB(4) //possible error due to truncation - vK + mov (16) DEST_A(%1)<1> SCALE_RESPONSE_YB(6) //possible error due to truncation - vK +} diff --git a/i965_drv_video/shaders/post_processing/Core_Kernels/Scaling.inc b/i965_drv_video/shaders/post_processing/Core_Kernels/Scaling.inc new file mode 100644 index 0000000..bf66d4c --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Core_Kernels/Scaling.inc @@ -0,0 +1,75 @@ +/* + * All Video Processing kernels + * Copyright © <2010>, Intel Corporation. 
+ * + * This program is licensed under the terms and conditions of the + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at + * http://www.opensource.org/licenses/eclipse-1.0.php. + * + */ + +// File name: Scaling.inc + +#ifndef _SCALING_INC_ +#define _SCALING_INC_ + +// Local variables---------------------------------------------------------------------------------- +#define MSGHDR_SCALE m1 // Message Payload Header (Uses m2, m3, m4, m5 implicitly) + +//-------------------------------------------------------------------------------------------------- +//r10.0 thru r33.0; Primary surface read from sampler (16x8) +#define DEST_Y uwTOP_Y +#define DEST_U uwTOP_U +#define DEST_V uwTOP_V + +//r10.0 thru r41.0 +.declare DEST_B Base=REG(r,10) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare DEST_G Base=REG(r,18) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare DEST_R Base=REG(r,26) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw +.declare DEST_A Base=REG(r,34) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw + +//r56.0 thru r79.0 +.declare SCALE_RESPONSE_YF Base=REG(r,nBOT_Y) ElementSize=4 SrcRegion=REGION(8,1) Type=f +.declare SCALE_RESPONSE_UF Base=REG(r,nBOT_U) ElementSize=4 SrcRegion=REGION(8,1) Type=f +.declare SCALE_RESPONSE_VF Base=REG(r,nBOT_V) ElementSize=4 SrcRegion=REGION(8,1) Type=f + +.declare SCALE_RESPONSE_YW Base=REG(r,nBOT_Y) ElementSize=2 SrcRegion=REGION(16,1) Type=uw +.declare SCALE_RESPONSE_UW Base=REG(r,nBOT_U) ElementSize=2 SrcRegion=REGION(16,1) Type=uw +.declare SCALE_RESPONSE_VW Base=REG(r,nBOT_V) ElementSize=2 SrcRegion=REGION(16,1) Type=uw + +.declare SCALE_RESPONSE_YD Base=REG(r,nBOT_Y) ElementSize=4 SrcRegion=REGION(8,1) Type=ud +.declare SCALE_RESPONSE_UD Base=REG(r,nBOT_U) ElementSize=4 SrcRegion=REGION(8,1) Type=ud +.declare SCALE_RESPONSE_VD Base=REG(r,nBOT_V) ElementSize=4 SrcRegion=REGION(8,1) Type=ud + +.declare SCALE_RESPONSE_YB Base=REG(r,nBOT_Y) 
ElementSize=1 SrcRegion=REGION(8,4) Type=ub +.declare SCALE_RESPONSE_UB Base=REG(r,nBOT_U) ElementSize=1 SrcRegion=REGION(8,4) Type=ub +.declare SCALE_RESPONSE_VB Base=REG(r,nBOT_V) ElementSize=1 SrcRegion=REGION(8,4) Type=ub + +.declare SAMPLER_RAMP Base=REG(r,42) ElementSize=4 SrcRegion=<8;8,1> Type=f // 2 GRFs, 16 elements + +//#define SCALE_STEP_X REG2(r,43,0) +//#define SCALE_COORD_X REG2(r,43,3) + +#define SCALE_LINE_P255 REG2(r,43,4) // = 255.0 Used in 'line' inst to multiply 255, add 0.5, and round to int. +#define SCALE_LINE_P0_5 REG2(r,43,7) // = 0.5 + +//r44.0 thru r45.0 +#define SCALE_COORD_Y REG(r,44) //2GRF + + +// Send Message [DevILK] Message Descriptor +// MBZ MsgL=5 MsgR=8 H MBZ SIMD MsgType SmplrIndx BindTab +// 000 0 101 0 1000 1 0 10 0000 0000 00000000 +// 0 A 8 A 0 0 0 0 +// MsgL=1+2*2(u,v)=5 MsgR=8 +#define SMPLR_MSG_DSC 0x0A8A0000 // ILK Sampler Message Descriptor + +// Re-define new number of lines +#undef nY_NUM_OF_ROWS +#undef nUV_NUM_OF_ROWS + +#define nY_NUM_OF_ROWS 8 +#define nUV_NUM_OF_ROWS 8 + + +#endif //_SCALING_INC_ diff --git a/i965_drv_video/shaders/post_processing/Makefile.am b/i965_drv_video/shaders/post_processing/Makefile.am new file mode 100644 index 0000000..9f97eb0 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/Makefile.am @@ -0,0 +1,28 @@ + +INTEL_G4I = + +INTEL_G4A = null.g4a + +INTEL_G4B = null.g4b + +INTEL_G4B_GEN5 = null.g4b.gen5 + +EXTRA_DIST = $(INTEL_G4I) \ + $(INTEL_G4A) \ + $(INTEL_G4B) \ + $(INTEL_G4B_GEN5) + +if HAVE_GEN4ASM + +SUFFIXES = .g4a .g4b +.g4a.g4b: + m4 $*.g4a > $*.g4m && intel-gen4asm -o $@ $*.g4m && intel-gen4asm -g 5 -o $@.gen5 $*.g4m && rm $*.g4m + +$(INTEL_G4B): $(INTEL_G4I) + +BUILT_SOURCES= $(INTEL_G4B) + +clean-local: + -rm -f $(INTEL_G4B) + -rm -f $(INTEL_G4B_GEN5) +endif diff --git a/i965_drv_video/shaders/post_processing/null.g4a b/i965_drv_video/shaders/post_processing/null.g4a new file mode 100644 index 0000000..cde124a --- /dev/null +++ 
b/i965_drv_video/shaders/post_processing/null.g4a @@ -0,0 +1,3 @@ +/* Just for test */ + +send(16) 0 acc0<1>UW g0<8,8,1>UW thread_spawner(0, 0, 0) mlen 1 rlen 0 {align1 EOT}; diff --git a/i965_drv_video/shaders/post_processing/null.g4b b/i965_drv_video/shaders/post_processing/null.g4b new file mode 100644 index 0000000..d8f28e7 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/null.g4b @@ -0,0 +1 @@ + { 0x00800031, 0x24001d28, 0x008d0000, 0x87100000 }, diff --git a/i965_drv_video/shaders/post_processing/null.g4b.gen5 b/i965_drv_video/shaders/post_processing/null.g4b.gen5 new file mode 100644 index 0000000..2bd0ba6 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/null.g4b.gen5 @@ -0,0 +1 @@ + { 0x00800031, 0x24001d28, 0x748d0000, 0x82000000 }, diff --git a/i965_drv_video/shaders/post_processing/nv12_avs_nv12.asm b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.asm new file mode 100644 index 0000000..80665e0 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.asm @@ -0,0 +1,19 @@ +// Module name: NV12_AVS_NV12 +.kernel NV12_AVS_NV12 +.code + +#define INC_SCALING + +#include "SetupVPKernel.asm" +#include "Multiple_Loop_Head.asm" +#include "PL2_AVS_IEF_16x8.asm" +#include "PL8x4_Save_NV12.asm" +#include "Multiple_Loop.asm" + +END_THREAD // End of Thread + +.end_code + +.end_kernel + +// end of nv12_avs_nv12.asm diff --git a/i965_drv_video/shaders/post_processing/nv12_avs_nv12.g4b.gen5 b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.g4b.gen5 new file mode 100644 index 0000000..b2a9e85 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/nv12_avs_nv12.g4b.gen5 @@ -0,0 +1,162 @@ + { 0x00600001, 0x21000021, 0x008d0000, 0x00000000 }, + { 0x00000441, 0x20842e2d, 0x000000b7, 0x00100010 }, + { 0x00000c01, 0x2086022d, 0x000000bb, 0x00000000 }, + { 0x00000801, 0x208a01ad, 0x000000a0, 0x00000000 }, + { 0x00200001, 0x209403bd, 0x006600a4, 0x00000000 }, + { 0x00000040, 0x208435ad, 0x00000084, 0x000000a0 }, + { 0x01000005, 
0x20002d2c, 0x00000088, 0x80008000 }, + { 0x00010001, 0x20c003fd, 0x00000000, 0x00000000 }, + { 0x00000001, 0x212003bd, 0x000000c0, 0x00000000 }, + { 0x00000001, 0x212403bd, 0x000000bc, 0x00000000 }, + { 0x00000001, 0x213403bd, 0x00000038, 0x00000000 }, + { 0x00200001, 0x612803bd, 0x004500a4, 0x00000000 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0000d000 }, + { 0x00802001, 0x20000022, 0x008d0100, 0x00000000 }, + { 0x00000031, 0x25401c09, 0x208d0000, 0x044bb401 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0000a000 }, + { 0x00802001, 0x20400022, 0x008d0100, 0x00000000 }, + { 0x02000031, 0x25c01c09, 0x208d0000, 0x048bb802 }, + { 0x00000001, 0x240803bc, 0x000000a4, 0x00000000 }, + { 0x00000048, 0x24087fbc, 0x000000bc, 0x41000000 }, + { 0x00000048, 0x21287fbd, 0x000000c0, 0x41e00000 }, + { 0x00000001, 0x240403bc, 0x000000bc, 0x00000000 }, + { 0x00000048, 0x21247fbd, 0x000000c0, 0x41000000 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0000d000 }, + { 0x00802001, 0x20000022, 0x008d0100, 0x00000000 }, + { 0x00000031, 0x27401c09, 0x208d0000, 0x044bb401 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0000a000 }, + { 0x00802001, 0x20400022, 0x008d0100, 0x00000000 }, + { 0x02000031, 0x27c01c09, 0x208d0000, 0x048bb802 }, + { 0x00600001, 0x21400229, 0x00aa0541, 0x00000000 }, + { 0x00600001, 0x21600229, 0x00aa0549, 0x00000000 }, + { 0x00600001, 0x21800229, 0x00aa0561, 0x00000000 }, + { 0x00600001, 0x21a00229, 0x00aa0569, 0x00000000 }, + { 0x00600001, 0x21c00229, 0x00aa0581, 0x00000000 }, + { 0x00600001, 0x21e00229, 0x00aa0589, 0x00000000 }, + { 0x00600001, 0x22000229, 0x00aa05a1, 0x00000000 }, + { 0x00600001, 0x22200229, 0x00aa05a9, 0x00000000 }, + { 0x00600001, 0x22400229, 0x00aa05c1, 0x00000000 }, + { 0x00600001, 0x22600229, 0x00aa05c9, 0x00000000 }, + { 0x00600001, 0x22800229, 0x00aa05e1, 0x00000000 }, + { 0x00600001, 0x22a00229, 0x00aa05e9, 0x00000000 }, + { 0x00600001, 0x22c00229, 0x00aa0641, 0x00000000 }, + { 0x00600001, 0x22e00229, 0x00aa0649, 0x00000000 }, + { 0x00600001, 
0x23000229, 0x00aa0661, 0x00000000 }, + { 0x00600001, 0x23200229, 0x00aa0669, 0x00000000 }, + { 0x00600001, 0x23400229, 0x00aa0601, 0x00000000 }, + { 0x00600001, 0x23600229, 0x00aa0609, 0x00000000 }, + { 0x00600001, 0x23800229, 0x00aa0621, 0x00000000 }, + { 0x00600001, 0x23a00229, 0x00aa0629, 0x00000000 }, + { 0x00600001, 0x23c00229, 0x00aa0681, 0x00000000 }, + { 0x00600001, 0x23e00229, 0x00aa0689, 0x00000000 }, + { 0x00600001, 0x24000229, 0x00aa06a1, 0x00000000 }, + { 0x00600001, 0x24200229, 0x00aa06a9, 0x00000000 }, + { 0x00600001, 0x21500229, 0x00aa0741, 0x00000000 }, + { 0x00600001, 0x21700229, 0x00aa0749, 0x00000000 }, + { 0x00600001, 0x21900229, 0x00aa0761, 0x00000000 }, + { 0x00600001, 0x21b00229, 0x00aa0769, 0x00000000 }, + { 0x00600001, 0x21d00229, 0x00aa0781, 0x00000000 }, + { 0x00600001, 0x21f00229, 0x00aa0789, 0x00000000 }, + { 0x00600001, 0x22100229, 0x00aa07a1, 0x00000000 }, + { 0x00600001, 0x22300229, 0x00aa07a9, 0x00000000 }, + { 0x00600001, 0x22500229, 0x00aa07c1, 0x00000000 }, + { 0x00600001, 0x22700229, 0x00aa07c9, 0x00000000 }, + { 0x00600001, 0x22900229, 0x00aa07e1, 0x00000000 }, + { 0x00600001, 0x22b00229, 0x00aa07e9, 0x00000000 }, + { 0x00600001, 0x22d00229, 0x00aa0841, 0x00000000 }, + { 0x00600001, 0x22f00229, 0x00aa0849, 0x00000000 }, + { 0x00600001, 0x23100229, 0x00aa0861, 0x00000000 }, + { 0x00600001, 0x23300229, 0x00aa0869, 0x00000000 }, + { 0x00600001, 0x23500229, 0x00aa0801, 0x00000000 }, + { 0x00600001, 0x23700229, 0x00aa0809, 0x00000000 }, + { 0x00600001, 0x23900229, 0x00aa0821, 0x00000000 }, + { 0x00600001, 0x23b00229, 0x00aa0829, 0x00000000 }, + { 0x00600001, 0x23d00229, 0x00aa0881, 0x00000000 }, + { 0x00600001, 0x23f00229, 0x00aa0889, 0x00000000 }, + { 0x00600001, 0x24100229, 0x00aa08a1, 0x00000000 }, + { 0x00600001, 0x24300229, 0x00aa08a9, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00200001, 0x202001a6, 0x004500a0, 0x00000000 }, + { 0x00000001, 0x20280062, 0x00000000, 0x0007000f }, + { 0x00000005, 
0x24000c20, 0x000000b8, 0x00ffffff }, + { 0x04000010, 0x20000c04, 0x00000400, 0x00ffffff }, + { 0x00010220, 0x34001c00, 0x00001400, 0x00000056 }, + { 0x01600031, 0x27000c01, 0x408d0000, 0x0248a007 }, + { 0x0000040c, 0x21043da1, 0x000000a2, 0x00010001 }, + { 0x00000801, 0x21080061, 0x00000000, 0x0003000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01600031, 0x28000c01, 0x408d0000, 0x0228a008 }, + { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0007000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00000001, 0x26020228, 0x000000ba, 0x00000000 }, + { 0x00610001, 0x24400129, 0x028d00b8, 0x00000000 }, + { 0x00710001, 0x24400169, 0x02000000, 0x00000000 }, + { 0x00000001, 0x24600061, 0x00000000, 0x00040001 }, + { 0x00000001, 0x24640061, 0x00000000, 0x00400010 }, + { 0x00000001, 0x24680061, 0x00000000, 0x04000100 }, + { 0x00000001, 0x246c0061, 0x00000000, 0x40001000 }, + { 0x00000001, 0x26020128, 0x00000440, 0x00000000 }, + { 0x00910001, 0x41400231, 0x02b10700, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x02000440, 0x008d0460 }, + { 0x00710001, 0x42400231, 0x02ae0800, 0x00000000 }, + { 0x00710001, 0x43400231, 0x02ae0801, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000442, 0x00000000 }, + { 0x00910001, 0x41600231, 0x02b10710, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000444, 0x00000000 }, + { 0x00910001, 0x41800231, 0x02b10720, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x02000444, 0x008d0460 }, + { 0x00710001, 0x42500231, 0x02ae0810, 0x00000000 }, + { 0x00710001, 0x43500231, 0x02ae0811, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000446, 0x00000000 }, + { 0x00910001, 0x41a00231, 0x02b10730, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000448, 0x00000000 }, + { 0x00910001, 0x41c00231, 0x02b10740, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x02000448, 0x008d0460 }, + { 0x00710001, 0x42600231, 0x02ae0820, 0x00000000 }, + { 0x00710001, 0x43600231, 0x02ae0821, 0x00000000 }, + { 0x00000001, 
0x26020128, 0x0000044a, 0x00000000 }, + { 0x00910001, 0x41e00231, 0x02b10750, 0x00000000 }, + { 0x00000001, 0x26020128, 0x0000044c, 0x00000000 }, + { 0x00910001, 0x42000231, 0x02b10760, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x0200044c, 0x008d0460 }, + { 0x00710001, 0x42700231, 0x02ae0830, 0x00000000 }, + { 0x00710001, 0x43700231, 0x02ae0831, 0x00000000 }, + { 0x00000001, 0x26020128, 0x0000044e, 0x00000000 }, + { 0x00910001, 0x42200231, 0x02b10770, 0x00000000 }, + { 0x00800001, 0x20400232, 0x00d20140, 0x00000000 }, + { 0x00800001, 0x20500232, 0x00d20160, 0x00000000 }, + { 0x00800001, 0x20600232, 0x00d20180, 0x00000000 }, + { 0x00800001, 0x20700232, 0x00d201a0, 0x00000000 }, + { 0x00800001, 0x20800232, 0x00d201c0, 0x00000000 }, + { 0x00800001, 0x20900232, 0x00d201e0, 0x00000000 }, + { 0x00800001, 0x20a00232, 0x00d20200, 0x00000000 }, + { 0x00800001, 0x20b00232, 0x00d20220, 0x00000000 }, + { 0x01600031, 0x20000c04, 0x508d0000, 0x0a082007 }, + { 0x00200001, 0x210001a5, 0x004500a0, 0x00000000 }, + { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0003000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00800001, 0x40400232, 0x00d20240, 0x00000000 }, + { 0x00800001, 0x40410232, 0x00d20340, 0x00000000 }, + { 0x00800001, 0x40600232, 0x00d20260, 0x00000000 }, + { 0x00800001, 0x40610232, 0x00d20360, 0x00000000 }, + { 0x01600031, 0x20000c04, 0x508d0000, 0x06082008 }, + { 0x01000040, 0x20863dad, 0x00000086, 0xffffffff }, + { 0x00000040, 0x20a03dad, 0x000000a0, 0x00100010 }, + { 0x05000010, 0x2000358c, 0x02210400, 0x00000084 }, + { 0x00000041, 0x24407fbd, 0x000000bc, 0x41800000 }, + { 0x00000040, 0x20a477bd, 0x00000440, 0x000000a4 }, + { 0x00010220, 0x34001c00, 0x00001400, 0x0000000e }, + { 0x00010220, 0x34001c00, 0x02001400, 0xfffffede }, + { 0x00000001, 0x20a001ad, 0x0000008a, 0x00000000 }, + { 0x00000040, 0x20a23dad, 0x000000a2, 0x00080008 }, + { 0x00000001, 0x20a403bd, 0x00000094, 0x00000000 }, + { 0x00000041, 
0x24407fbd, 0x00000038, 0x41000000 }, + { 0x00000040, 0x20a877bd, 0x00000440, 0x000000a8 }, + { 0x00000220, 0x34001c00, 0x00001400, 0xfffffed2 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 }, + { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 }, + { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 }, + { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 }, diff --git a/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.asm b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.asm new file mode 100644 index 0000000..3ea9cea --- /dev/null +++ b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.asm @@ -0,0 +1,18 @@ +// Module name: NV12_DNDI_NV12 +.kernel NV12_DNDI_NV12 +.code + +#define INC_DNDI + +#include "SetupVPKernel.asm" +#include "Multiple_Loop_Head.asm" +#include "PL_DNDI_ALG_UVCopy_NV12.asm" +#include "Multiple_Loop.asm" + +END_THREAD // End of Thread + +.end_code + +.end_kernel + +// end of nv12_dndi_nv12.asm diff --git a/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.g4b.gen5 b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.g4b.gen5 new file mode 100644 index 0000000..1f60f3f --- /dev/null +++ b/i965_drv_video/shaders/post_processing/nv12_dndi_nv12.g4b.gen5 @@ -0,0 +1,86 @@ + { 0x00600001, 0x21000021, 0x008d0000, 0x00000000 }, + { 0x00000441, 0x20842e2d, 0x000000b7, 0x00100010 }, + { 0x00000c01, 0x2086022d, 0x000000bb, 0x00000000 }, + { 0x00000801, 0x208a01ad, 0x000000a0, 0x00000000 }, + { 0x00200001, 0x209403bd, 0x006600a4, 0x00000000 }, + { 0x00000040, 0x208435ad, 0x00000084, 0x000000a0 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00000001, 0x204801aa, 0x000000a0, 0x00000000 }, + { 0x00000001, 0x205801aa, 0x000000a2, 0x00000000 }, + { 0x01600031, 0x24400c01, 0x208d0000, 0x04cb8004 }, + { 0x00800001, 0x21400229, 0x00b10440, 0x00000000 }, + { 0x00800001, 0x21600229, 0x00b10450, 0x00000000 }, + { 0x00800001, 0x21800229, 0x00b10460, 0x00000000 }, + { 0x00800001, 
0x21a00229, 0x00b10470, 0x00000000 }, + { 0x00600001, 0x22400229, 0x00ae0481, 0x00000000 }, + { 0x00600001, 0x23400229, 0x00ae0480, 0x00000000 }, + { 0x00600001, 0x22500229, 0x00ae0491, 0x00000000 }, + { 0x00600001, 0x23500229, 0x00ae0490, 0x00000000 }, + { 0x00600001, 0x22600229, 0x00ae04a1, 0x00000000 }, + { 0x00600001, 0x23600229, 0x00ae04a0, 0x00000000 }, + { 0x00600001, 0x22700229, 0x00ae04b1, 0x00000000 }, + { 0x00600001, 0x23700229, 0x00ae04b0, 0x00000000 }, + { 0x00800001, 0x21c00229, 0x00b104c0, 0x00000000 }, + { 0x00800001, 0x21e00229, 0x00b104d0, 0x00000000 }, + { 0x00800001, 0x22000229, 0x00b104e0, 0x00000000 }, + { 0x00800001, 0x22200229, 0x00b104f0, 0x00000000 }, + { 0x00600001, 0x22800229, 0x00ae0501, 0x00000000 }, + { 0x00600001, 0x23800229, 0x00ae0500, 0x00000000 }, + { 0x00600001, 0x22900229, 0x00ae0511, 0x00000000 }, + { 0x00600001, 0x23900229, 0x00ae0510, 0x00000000 }, + { 0x00600001, 0x22a00229, 0x00ae0521, 0x00000000 }, + { 0x00600001, 0x23a00229, 0x00ae0520, 0x00000000 }, + { 0x00600001, 0x22b00229, 0x00ae0531, 0x00000000 }, + { 0x00600001, 0x23b00229, 0x00ae0530, 0x00000000 }, + { 0x00000008, 0x21003da1, 0x000000a0, 0x00010001 }, + { 0x00000001, 0x210401a1, 0x000000a2, 0x00000000 }, + { 0x00000001, 0x21080061, 0x00000000, 0x00030007 }, + { 0x00600001, 0x21600022, 0x008d0100, 0x00000000 }, + { 0x00600001, 0x21800022, 0x008d0540, 0x00000000 }, + { 0x0b600031, 0x20000c04, 0x508d0000, 0x04082014 }, + { 0x00200008, 0x21003da1, 0x004500a0, 0x00020002 }, + { 0x00000040, 0x21002421, 0x00000100, 0x00000034 }, + { 0x00000001, 0x21080061, 0x00000000, 0x00000003 }, + { 0x00600001, 0x21a00022, 0x008d0100, 0x00000000 }, + { 0x00000001, 0x21c00022, 0x00000560, 0x00000000 }, + { 0x0d600031, 0x20000c04, 0x508d0000, 0x04082014 }, + { 0x00400040, 0x22083e28, 0x00690024, 0x07000700 }, + { 0x01000010, 0x20003e2c, 0x0000003b, 0x00010001 }, + { 0x00010220, 0x34001c00, 0x00001400, 0x0000000a }, + { 0x00400001, 0x20400022, 0x00690580, 0x00000000 }, + { 0x00400001, 
0x20500022, 0x006904d0, 0x00000000 }, + { 0x00400001, 0x20600022, 0x00690590, 0x00000000 }, + { 0x00400001, 0x20700022, 0x006904f0, 0x00000000 }, + { 0x00000220, 0x34001c00, 0x00001400, 0x00000008 }, + { 0x00400001, 0x20400022, 0x006904c0, 0x00000000 }, + { 0x00400001, 0x20500022, 0x00690580, 0x00000000 }, + { 0x00400001, 0x20600022, 0x006904e0, 0x00000000 }, + { 0x00400001, 0x20700022, 0x00690590, 0x00000000 }, + { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0003000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01600031, 0x20000c04, 0x508d0000, 0x06082007 }, + { 0x00200040, 0x210035a5, 0x004500a0, 0x00450074 }, + { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0001000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01600031, 0x28000c01, 0x408d0000, 0x0218a002 }, + { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 }, + { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0001000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00600001, 0x20400022, 0x008d0800, 0x00000000 }, + { 0x01600031, 0x20000c04, 0x508d0000, 0x04082008 }, + { 0x01000040, 0x20863dad, 0x00000086, 0xffffffff }, + { 0x00000040, 0x20a03dad, 0x000000a0, 0x00100010 }, + { 0x05000010, 0x2000358c, 0x02210400, 0x00000084 }, + { 0x00010220, 0x34001c00, 0x00001400, 0x00000008 }, + { 0x00010220, 0x34001c00, 0x02001400, 0xffffff70 }, + { 0x00000001, 0x20a001ad, 0x0000008a, 0x00000000 }, + { 0x00000040, 0x20a23dad, 0x000000a2, 0x00080008 }, + { 0x00000220, 0x34001c00, 0x00001400, 0xffffff6a }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 }, + { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 }, + { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 }, + { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 }, diff --git 
a/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.asm b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.asm new file mode 100644 index 0000000..f234f83 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.asm @@ -0,0 +1,17 @@ +// Module name: NV12_LOAD_SAVE_NV12 +.kernel NV12_LOAD_SAVE_NV12 +.code + +#include "SetupVPKernel.asm" +#include "Multiple_Loop_Head.asm" +#include "NV12_Load_8x4.asm" +#include "PL8x4_Save_NV12.asm" +#include "Multiple_Loop.asm" + +END_THREAD // End of Thread + +.end_code + +.end_kernel + +// end of nv12_load_save_nv12.asm diff --git a/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.g4b.gen5 b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.g4b.gen5 new file mode 100644 index 0000000..9802ff2 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/nv12_load_save_nv12.g4b.gen5 @@ -0,0 +1,106 @@ + { 0x00600001, 0x21000021, 0x008d0000, 0x00000000 }, + { 0x00000441, 0x20842e2d, 0x000000b7, 0x00100010 }, + { 0x00000c01, 0x2086022d, 0x000000bb, 0x00000000 }, + { 0x00000801, 0x208a01ad, 0x000000a0, 0x00000000 }, + { 0x00200001, 0x209403bd, 0x006600a4, 0x00000000 }, + { 0x00000040, 0x208435ad, 0x00000084, 0x000000a0 }, + { 0x00200040, 0x210035a5, 0x004500a0, 0x00450074 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0007000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01600031, 0x27000c01, 0x408d0000, 0x0248a001 }, + { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0003000f }, + { 0x00600001, 0x20400022, 0x008d0100, 0x00000000 }, + { 0x02600031, 0x28000c01, 0x408d0000, 0x0228a002 }, + { 0x00800001, 0x22200229, 0x00b10770, 0x00000000 }, + { 0x00800001, 0x22000229, 0x00b10760, 0x00000000 }, + { 0x00800001, 0x21e00229, 0x00b10750, 0x00000000 }, + { 0x00800001, 0x21c00229, 0x00b10740, 0x00000000 }, + { 0x00800001, 0x21a00229, 0x00b10730, 0x00000000 }, + { 0x00800001, 0x21800229, 0x00b10720, 0x00000000 }, + { 0x00800001, 
0x21600229, 0x00b10710, 0x00000000 }, + { 0x00800001, 0x21400229, 0x00b10700, 0x00000000 }, + { 0x00800001, 0x22600229, 0x00d20820, 0x00000000 }, + { 0x00800001, 0x23600229, 0x00d20821, 0x00000000 }, + { 0x00800001, 0x22400229, 0x00d20800, 0x00000000 }, + { 0x00800001, 0x23400229, 0x00d20801, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00200001, 0x202001a6, 0x004500a0, 0x00000000 }, + { 0x00000001, 0x20280062, 0x00000000, 0x0007000f }, + { 0x00000005, 0x24000c20, 0x000000b8, 0x00ffffff }, + { 0x04000010, 0x20000c04, 0x00000400, 0x00ffffff }, + { 0x00010220, 0x34001c00, 0x00001400, 0x00000056 }, + { 0x01600031, 0x27000c01, 0x408d0000, 0x0248a007 }, + { 0x0000040c, 0x21043da1, 0x000000a2, 0x00010001 }, + { 0x00000801, 0x21080061, 0x00000000, 0x0003000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01600031, 0x28000c01, 0x408d0000, 0x0228a008 }, + { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0007000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00000001, 0x26020228, 0x000000ba, 0x00000000 }, + { 0x00610001, 0x24400129, 0x028d00b8, 0x00000000 }, + { 0x00710001, 0x24400169, 0x02000000, 0x00000000 }, + { 0x00000001, 0x24600061, 0x00000000, 0x00040001 }, + { 0x00000001, 0x24640061, 0x00000000, 0x00400010 }, + { 0x00000001, 0x24680061, 0x00000000, 0x04000100 }, + { 0x00000001, 0x246c0061, 0x00000000, 0x40001000 }, + { 0x00000001, 0x26020128, 0x00000440, 0x00000000 }, + { 0x00910001, 0x41400231, 0x02b10700, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x02000440, 0x008d0460 }, + { 0x00710001, 0x42400231, 0x02ae0800, 0x00000000 }, + { 0x00710001, 0x43400231, 0x02ae0801, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000442, 0x00000000 }, + { 0x00910001, 0x41600231, 0x02b10710, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000444, 0x00000000 }, + { 0x00910001, 0x41800231, 0x02b10720, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x02000444, 0x008d0460 }, + { 0x00710001, 
0x42500231, 0x02ae0810, 0x00000000 }, + { 0x00710001, 0x43500231, 0x02ae0811, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000446, 0x00000000 }, + { 0x00910001, 0x41a00231, 0x02b10730, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000448, 0x00000000 }, + { 0x00910001, 0x41c00231, 0x02b10740, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x02000448, 0x008d0460 }, + { 0x00710001, 0x42600231, 0x02ae0820, 0x00000000 }, + { 0x00710001, 0x43600231, 0x02ae0821, 0x00000000 }, + { 0x00000001, 0x26020128, 0x0000044a, 0x00000000 }, + { 0x00910001, 0x41e00231, 0x02b10750, 0x00000000 }, + { 0x00000001, 0x26020128, 0x0000044c, 0x00000000 }, + { 0x00910001, 0x42000231, 0x02b10760, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x0200044c, 0x008d0460 }, + { 0x00710001, 0x42700231, 0x02ae0830, 0x00000000 }, + { 0x00710001, 0x43700231, 0x02ae0831, 0x00000000 }, + { 0x00000001, 0x26020128, 0x0000044e, 0x00000000 }, + { 0x00910001, 0x42200231, 0x02b10770, 0x00000000 }, + { 0x00800001, 0x20400232, 0x00d20140, 0x00000000 }, + { 0x00800001, 0x20500232, 0x00d20160, 0x00000000 }, + { 0x00800001, 0x20600232, 0x00d20180, 0x00000000 }, + { 0x00800001, 0x20700232, 0x00d201a0, 0x00000000 }, + { 0x00800001, 0x20800232, 0x00d201c0, 0x00000000 }, + { 0x00800001, 0x20900232, 0x00d201e0, 0x00000000 }, + { 0x00800001, 0x20a00232, 0x00d20200, 0x00000000 }, + { 0x00800001, 0x20b00232, 0x00d20220, 0x00000000 }, + { 0x01600031, 0x20000c04, 0x508d0000, 0x0a082007 }, + { 0x00200001, 0x210001a5, 0x004500a0, 0x00000000 }, + { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0003000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00800001, 0x40400232, 0x00d20240, 0x00000000 }, + { 0x00800001, 0x40410232, 0x00d20340, 0x00000000 }, + { 0x00800001, 0x40600232, 0x00d20260, 0x00000000 }, + { 0x00800001, 0x40610232, 0x00d20360, 0x00000000 }, + { 0x01600031, 0x20000c04, 0x508d0000, 0x06082008 }, + { 0x01000040, 0x20863dad, 0x00000086, 0xffffffff }, + { 0x00000040, 
0x20a03dad, 0x000000a0, 0x00100010 }, + { 0x05000010, 0x2000358c, 0x02210400, 0x00000084 }, + { 0x00010220, 0x34001c00, 0x00001400, 0x00000008 }, + { 0x00010220, 0x34001c00, 0x02001400, 0xffffff48 }, + { 0x00000001, 0x20a001ad, 0x0000008a, 0x00000000 }, + { 0x00000040, 0x20a23dad, 0x000000a2, 0x00080008 }, + { 0x00000220, 0x34001c00, 0x00001400, 0xffffff42 }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 }, + { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 }, + { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 }, + { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 }, diff --git a/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.asm b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.asm new file mode 100644 index 0000000..d93d879 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.asm @@ -0,0 +1,20 @@ +// Module name: NV12_SCALING_NV12 +.kernel NV12_SCALING_NV12 +.code + +#define INC_SCALING + +#include "SetupVPKernel.asm" +#include "Multiple_Loop_Head.asm" +#include "PL2_Scaling.asm" +#include "PL16x8_PL8x4.asm" +#include "PL8x4_Save_NV12.asm" +#include "Multiple_Loop.asm" + +END_THREAD // End of Thread + +.end_code + +.end_kernel + +// end of nv12_scaling_nv12.asm diff --git a/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.g4b.gen5 b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.g4b.gen5 new file mode 100644 index 0000000..6e99720 --- /dev/null +++ b/i965_drv_video/shaders/post_processing/nv12_scaling_nv12.g4b.gen5 @@ -0,0 +1,222 @@ + { 0x00600001, 0x21000021, 0x008d0000, 0x00000000 }, + { 0x00000441, 0x20842e2d, 0x000000b7, 0x00100010 }, + { 0x00000c01, 0x2086022d, 0x000000bb, 0x00000000 }, + { 0x00000801, 0x208a01ad, 0x000000a0, 0x00000000 }, + { 0x00200001, 0x209403bd, 0x006600a4, 0x00000000 }, + { 0x00000040, 0x208435ad, 0x00000084, 0x000000a0 }, + { 0x00400001, 0x254002fd, 0x00000000, 0x48403000 }, + { 0x00400001, 0x255002fd, 0x00000000, 
0x5c585450 }, + { 0x00600040, 0x25607fbd, 0x008d0540, 0x41000000 }, + { 0x00200401, 0x21000061, 0x00000000, 0x00000000 }, + { 0x00000801, 0x21080061, 0x00000000, 0x00000000 }, + { 0x00802001, 0x208003be, 0x000000a8, 0x00000000 }, + { 0x00802001, 0x258003bd, 0x000000a8, 0x00000000 }, + { 0x00802001, 0x240003bc, 0x000000a4, 0x00000000 }, + { 0x00802048, 0x204077be, 0x000000bc, 0x008d0540 }, + { 0x00000401, 0x257003fd, 0x00000000, 0x437f0000 }, + { 0x00000801, 0x257c03fd, 0x00000000, 0x3f000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 }, + { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 }, + { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 }, + { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 }, + { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 }, + { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 }, + { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 }, + { 0x00800001, 0x21400229, 0x00cf0700, 0x00000000 }, + { 0x00800001, 0x22400229, 0x00cf0800, 0x00000000 }, + { 0x00800001, 0x23400229, 0x00cf0840, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 }, + { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 }, + { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 }, + { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 }, + { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 }, + { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 }, + { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 }, + { 0x00800001, 0x21600229, 0x00cf0700, 0x00000000 }, + { 0x00800001, 0x22600229, 0x00cf0800, 0x00000000 }, + { 0x00800001, 0x23600229, 0x00cf0840, 
0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 }, + { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 }, + { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 }, + { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 }, + { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 }, + { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 }, + { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 }, + { 0x00800001, 0x21800229, 0x00cf0700, 0x00000000 }, + { 0x00800001, 0x22800229, 0x00cf0800, 0x00000000 }, + { 0x00800001, 0x23800229, 0x00cf0840, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 }, + { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 }, + { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 }, + { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 }, + { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 }, + { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 }, + { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 }, + { 0x00800001, 0x21a00229, 0x00cf0700, 0x00000000 }, + { 0x00800001, 0x22a00229, 0x00cf0800, 0x00000000 }, + { 0x00800001, 0x23a00229, 0x00cf0840, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 }, + { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 }, + { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 }, + { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 }, + { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 }, + { 0x00802001, 0x28000381, 0x00b10400, 
0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 }, + { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 }, + { 0x00800001, 0x21c00229, 0x00cf0700, 0x00000000 }, + { 0x00800001, 0x22c00229, 0x00cf0800, 0x00000000 }, + { 0x00800001, 0x23c00229, 0x00cf0840, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 }, + { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 }, + { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 }, + { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 }, + { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 }, + { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 }, + { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 }, + { 0x00800001, 0x21e00229, 0x00cf0700, 0x00000000 }, + { 0x00800001, 0x22e00229, 0x00cf0800, 0x00000000 }, + { 0x00800001, 0x23e00229, 0x00cf0840, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 }, + { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 }, + { 0x00802040, 0x208077be, 0x008d0580, 0x00000038 }, + { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 }, + { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 }, + { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 }, + { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 }, + { 0x00800001, 0x22000229, 0x00cf0700, 0x00000000 }, + { 0x00800001, 0x23000229, 0x00cf0800, 0x00000000 }, + { 0x00800001, 0x24000229, 0x00cf0840, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01800031, 0x27001c09, 0x208d0000, 0x0a8a0101 }, + { 0x01800031, 0x28001c09, 0x208d0000, 0x0a8a0202 }, + { 0x00802040, 0x208077be, 0x008d0580, 
0x00000038 }, + { 0x00802040, 0x258077bd, 0x008d0580, 0x00000038 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0700 }, + { 0x00802001, 0x27000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0800 }, + { 0x00802001, 0x28000381, 0x00b10400, 0x00000000 }, + { 0x00802059, 0x240077bc, 0x00000570, 0x008d0840 }, + { 0x00802001, 0x28400381, 0x00b10400, 0x00000000 }, + { 0x00800001, 0x22200229, 0x00cf0700, 0x00000000 }, + { 0x00800001, 0x23200229, 0x00cf0800, 0x00000000 }, + { 0x00800001, 0x24200229, 0x00cf0840, 0x00000000 }, + { 0x00600001, 0x22400129, 0x00ae0240, 0x00000000 }, + { 0x00600001, 0x23400129, 0x00ae0340, 0x00000000 }, + { 0x00600001, 0x22500129, 0x00ae0280, 0x00000000 }, + { 0x00600001, 0x23500129, 0x00ae0380, 0x00000000 }, + { 0x00600001, 0x22600129, 0x00ae02c0, 0x00000000 }, + { 0x00600001, 0x23600129, 0x00ae03c0, 0x00000000 }, + { 0x00600001, 0x22700129, 0x00ae0300, 0x00000000 }, + { 0x00600001, 0x23700129, 0x00ae0400, 0x00000000 }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00200001, 0x202001a6, 0x004500a0, 0x00000000 }, + { 0x00000001, 0x20280062, 0x00000000, 0x0007000f }, + { 0x00000005, 0x24000c20, 0x000000b8, 0x00ffffff }, + { 0x04000010, 0x20000c04, 0x00000400, 0x00ffffff }, + { 0x00010220, 0x34001c00, 0x00001400, 0x00000056 }, + { 0x01600031, 0x27000c01, 0x408d0000, 0x0248a007 }, + { 0x0000040c, 0x21043da1, 0x000000a2, 0x00010001 }, + { 0x00000801, 0x21080061, 0x00000000, 0x0003000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x01600031, 0x28000c01, 0x408d0000, 0x0228a008 }, + { 0x00200001, 0x210001a1, 0x004500a0, 0x00000000 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0007000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00000001, 0x26020228, 0x000000ba, 0x00000000 }, + { 0x00610001, 0x24400129, 0x028d00b8, 0x00000000 }, + { 0x00710001, 0x24400169, 0x02000000, 0x00000000 }, + { 0x00000001, 0x24600061, 0x00000000, 0x00040001 }, + { 0x00000001, 0x24640061, 0x00000000, 
0x00400010 }, + { 0x00000001, 0x24680061, 0x00000000, 0x04000100 }, + { 0x00000001, 0x246c0061, 0x00000000, 0x40001000 }, + { 0x00000001, 0x26020128, 0x00000440, 0x00000000 }, + { 0x00910001, 0x41400231, 0x02b10700, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x02000440, 0x008d0460 }, + { 0x00710001, 0x42400231, 0x02ae0800, 0x00000000 }, + { 0x00710001, 0x43400231, 0x02ae0801, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000442, 0x00000000 }, + { 0x00910001, 0x41600231, 0x02b10710, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000444, 0x00000000 }, + { 0x00910001, 0x41800231, 0x02b10720, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x02000444, 0x008d0460 }, + { 0x00710001, 0x42500231, 0x02ae0810, 0x00000000 }, + { 0x00710001, 0x43500231, 0x02ae0811, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000446, 0x00000000 }, + { 0x00910001, 0x41a00231, 0x02b10730, 0x00000000 }, + { 0x00000001, 0x26020128, 0x00000448, 0x00000000 }, + { 0x00910001, 0x41c00231, 0x02b10740, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x02000448, 0x008d0460 }, + { 0x00710001, 0x42600231, 0x02ae0820, 0x00000000 }, + { 0x00710001, 0x43600231, 0x02ae0821, 0x00000000 }, + { 0x00000001, 0x26020128, 0x0000044a, 0x00000000 }, + { 0x00910001, 0x41e00231, 0x02b10750, 0x00000000 }, + { 0x00000001, 0x26020128, 0x0000044c, 0x00000000 }, + { 0x00910001, 0x42000231, 0x02b10760, 0x00000000 }, + { 0x02600005, 0x2000252c, 0x0200044c, 0x008d0460 }, + { 0x00710001, 0x42700231, 0x02ae0830, 0x00000000 }, + { 0x00710001, 0x43700231, 0x02ae0831, 0x00000000 }, + { 0x00000001, 0x26020128, 0x0000044e, 0x00000000 }, + { 0x00910001, 0x42200231, 0x02b10770, 0x00000000 }, + { 0x00800001, 0x20400232, 0x00d20140, 0x00000000 }, + { 0x00800001, 0x20500232, 0x00d20160, 0x00000000 }, + { 0x00800001, 0x20600232, 0x00d20180, 0x00000000 }, + { 0x00800001, 0x20700232, 0x00d201a0, 0x00000000 }, + { 0x00800001, 0x20800232, 0x00d201c0, 0x00000000 }, + { 0x00800001, 0x20900232, 0x00d201e0, 0x00000000 }, + { 0x00800001, 0x20a00232, 0x00d20200, 
0x00000000 }, + { 0x00800001, 0x20b00232, 0x00d20220, 0x00000000 }, + { 0x01600031, 0x20000c04, 0x508d0000, 0x0a082007 }, + { 0x00200001, 0x210001a5, 0x004500a0, 0x00000000 }, + { 0x0000000c, 0x21043ca5, 0x00000104, 0x00010001 }, + { 0x00000001, 0x21080061, 0x00000000, 0x0003000f }, + { 0x00600001, 0x20200022, 0x008d0100, 0x00000000 }, + { 0x00800001, 0x40400232, 0x00d20240, 0x00000000 }, + { 0x00800001, 0x40410232, 0x00d20340, 0x00000000 }, + { 0x00800001, 0x40600232, 0x00d20260, 0x00000000 }, + { 0x00800001, 0x40610232, 0x00d20360, 0x00000000 }, + { 0x01600031, 0x20000c04, 0x508d0000, 0x06082008 }, + { 0x01000040, 0x20863dad, 0x00000086, 0xffffffff }, + { 0x00000040, 0x20a03dad, 0x000000a0, 0x00100010 }, + { 0x05000010, 0x2000358c, 0x02210400, 0x00000084 }, + { 0x00000041, 0x24407fbd, 0x000000bc, 0x41800000 }, + { 0x00000040, 0x20a477bd, 0x00000440, 0x000000a4 }, + { 0x00010220, 0x34001c00, 0x00001400, 0x0000000e }, + { 0x00010220, 0x34001c00, 0x02001400, 0xfffffe66 }, + { 0x00000001, 0x20a001ad, 0x0000008a, 0x00000000 }, + { 0x00000040, 0x20a23dad, 0x000000a2, 0x00080008 }, + { 0x00000001, 0x20a403bd, 0x00000094, 0x00000000 }, + { 0x00000041, 0x24407fbd, 0x00000038, 0x41000000 }, + { 0x00000040, 0x20a877bd, 0x00000440, 0x000000a8 }, + { 0x00000220, 0x34001c00, 0x00001400, 0xfffffe5a }, + { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 }, + { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 }, + { 0x00600001, 0x21e00022, 0x008d0000, 0x00000000 }, + { 0x0f000031, 0x20000c04, 0x708d0000, 0x82000000 }, diff --git a/libva-tpi.pc.in b/libva-tpi.pc.in new file mode 100644 index 0000000..43616c0 --- /dev/null +++ b/libva-tpi.pc.in @@ -0,0 +1,11 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: libva-tpi +Description: Userspace Video Acceleration (VA) 3rd party interface +Requires: libva +Version: @PACKAGE_VERSION@ +Libs: -L${libdir} -lva-tpi +Cflags: -I${includedir} @@ 
-145,6 +145,14 @@ typedef int VAStatus; /* Return status type from functions */ /* Color space conversion flags for vaPutSurface() */ #define VA_SRC_BT601 0x00000010 #define VA_SRC_BT709 0x00000020 +#define VA_SRC_SMPTE_240 0x00000040 + +/* Scaling flags for vaPutSurface() */ +#define VA_FILTER_SCALING_DEFAULT 0x00000000 +#define VA_FILTER_SCALING_FAST 0x00000100 +#define VA_FILTER_SCALING_HQ 0x00000200 +#define VA_FILTER_SCALING_NL_ANAMORPHIC 0x00000300 +#define VA_FILTER_SCALING_MASK 0x00000f00 /* * Returns a short english description of error_status @@ -1114,7 +1122,9 @@ typedef struct _VAEncPictureParameterBufferH264 VABufferID coded_buf; unsigned short picture_width; unsigned short picture_height; - unsigned char last_picture; /* if set to 1 it indicates the last picture in the sequence */ + unsigned char last_picture; /* if set to 1 it indicates the last picture in the sequence + * if set to 2 it indicates the last picture of the stream + */ } VAEncPictureParameterBufferH264; /**************************** @@ -1685,15 +1695,6 @@ typedef enum VADISPLAYATTRIB_BLE_NONE, } VADisplayAttribBLEMode; -typedef enum -{ - VADISPLAYATTRIB_CSC_FORMAT_YCC_BT601 = 0x00, - VADISPLAYATTRIB_CSC_FORMAT_YCC_BT709, - VADISPLAYATTRIB_CSC_FORMAT_YCC_SMPTE_240, - VADISPLAYATTRIB_CSC_FORMAT_RGB, - VADISPLAYATTRIB_CSC_FORMAT_NONE, -} VADisplayAttribCSCFormat; - /* attribute value for VADisplayAttribRotation */ #define VA_ROTATION_NONE 0x00000000 #define VA_ROTATION_90 0x00000001 diff --git a/va/x11/Makefile.am b/va/x11/Makefile.am index 2e3619c..31e381e 100644 --- a/va/x11/Makefile.am +++ b/va/x11/Makefile.am @@ -1,22 +1,24 @@ -# INTEL CONFIDENTIAL -# Copyright 2007 Intel Corporation. All Rights Reserved. +# Copyright (c) 2007 Intel Corporation. All Rights Reserved. # -# The source code contained or described herein and all documents related to -# the source code ("Material") are owned by Intel Corporation or its suppliers -# or licensors. 
Title to the Material remains with Intel Corporation or its -# suppliers and licensors. The Material may contain trade secrets and -# proprietary and confidential information of Intel Corporation and its -# suppliers and licensors, and is protected by worldwide copyright and trade -# secret laws and treaty provisions. No part of the Material may be used, -# copied, reproduced, modified, published, uploaded, posted, transmitted, -# distributed, or disclosed in any way without Intel's prior express written -# permission. +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sub license, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: # -# No license under any patent, copyright, trade secret or other intellectual -# property right is granted to or conferred upon you by disclosure or delivery -# of the Materials, either expressly, by implication, inducement, estoppel or -# otherwise. Any license under such intellectual property rights must be -# express and approved by Intel in writing. +# The above copyright notice and this permission notice (including the +# next paragraph) shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +# IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR +# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
AM_CFLAGS = -DLINUX -I$(top_srcdir)/va $(DRM_CFLAGS) |