summaryrefslogtreecommitdiff
path: root/backend/src/libocl/src/ocl_misc.cl
diff options
context:
space:
mode:
Diffstat (limited to 'backend/src/libocl/src/ocl_misc.cl')
-rw-r--r--backend/src/libocl/src/ocl_misc.cl1325
1 files changed, 1325 insertions, 0 deletions
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index bfa2fa71..ce139a6c 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -232,6 +232,1331 @@ struct time_stamp __gen_ocl_get_timestamp(void) {
return val;
};
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_initialize(ushort2 src_coord,
+ uchar partition_mask,
+ uchar sad_adjustment){
+ intel_sub_group_avc_ime_payload_t pl;
+ pl.srcCoord = src_coord;
+ pl.partition_mask = partition_mask;
+ pl.sad_adjustment = sad_adjustment;
+ pl.ref_offset = (short2)(0, 0);
+ pl.search_window_config = 0;
+ pl.cc0 = 0;
+ pl.cc1 = 0;
+ pl.cc2 = 0;
+ pl.cc3 = 0;
+ pl.packed_cost_table = (uint2)(0, 0);
+ pl.cost_precision = 2;
+ pl.packed_shape_cost = 0;
+ return pl;
+}
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_set_single_reference(short2 ref_offset,
+ uchar search_window_config,
+ intel_sub_group_avc_ime_payload_t payload){
+ intel_sub_group_avc_ime_payload_t pl = payload;
+ pl.ref_offset = ref_offset;
+ pl.search_window_config = search_window_config;
+ return pl;
+}
+
+intel_sub_group_avc_ime_result_t
+intel_sub_group_avc_ime_evaluate_with_single_reference(read_only image2d_t src_image,
+ read_only image2d_t ref_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_ime_payload_t payload){
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+ uint src_grf5_dw7;
+ uint src_grf5_dw6;
+ uint src_grf5_dw5;
+ uint src_grf5_dw4;
+ uint src_grf5_dw3;
+ uint src_grf5_dw2;
+ uint src_grf5_dw1;
+ uint src_grf5_dw0;
+ uint src_grf6_dw7;
+ uint src_grf6_dw6;
+ uint src_grf6_dw5;
+ uint src_grf6_dw4;
+ uint src_grf6_dw3;
+ uint src_grf6_dw2;
+ uint src_grf6_dw1;
+ uint src_grf6_dw0;
+ uint src_grf7_dw7;
+ uint src_grf7_dw6;
+ uint src_grf7_dw5;
+ uint src_grf7_dw4;
+ uint src_grf7_dw3;
+ uint src_grf7_dw2;
+ uint src_grf7_dw1;
+ uint src_grf7_dw0;
+
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+
+ short2 predict_mv = payload.ref_offset;
+ //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+ src_grf0_dw5 = (20 << 24) | (20 << 16) | (0 << 8) | (0);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
+
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (payload.partition_mask << 24) | (0 << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (payload.sad_adjustment << 20)| (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (0 << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (payload.srcCoord.y << 16) | (payload.srcCoord.x);
+
+ /*src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ | (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);*/
+ src_grf1_dw7 = (payload.cost_precision << 16);
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = Reseverd for BDW+
+ src_grf1_dw4 = Reseverd for BDW+*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ //src_grf1_dw3 = Weighted SAD Control Sub-block 0...15
+ src_grf1_dw3 = 0;
+ //XXX: should set src_grf1_dw2
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ | (0 << 16) | (2 << 8) | (2);
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ src_grf1_dw1 = (0 << 24) | (16);
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 0;
+
+ //src_grf2_dw7 = SIC Forward Transform Coeff Threshold Matrix[3...6]
+ src_grf2_dw7 = 0;
+ //src_grf2_dw6 = SIC Forward Transform Coeff Threshold Matrix[0...2]
+ src_grf2_dw6 = 0;
+ //src_grf2_dw5 = (Reserved << 24) | (FBR_SubPredMode_Input << 16) | (FBR_SubMBShape_Input << 8) | (Reserved << 2) | (FBR_MbMode_Input);
+ src_grf2_dw5 = 0;
+ //src_grf2_dw4 = MV_4_Cost ... MV_7_Cost;
+ src_grf2_dw4 = payload.packed_cost_table.s1;
+ //src_grf2_dw3 = MV_0_Cost ... MV_3_Cost;
+ src_grf2_dw3 = payload.packed_cost_table.s0;
+ //src_grf2_dw2 = ... Mode 8 Cost;
+ src_grf2_dw2 = (payload.packed_shape_cost >> 32) & 0x000000ff;
+ //src_grf2_dw1 = Mode 4 Cost ... Mode 7 Cost
+ src_grf2_dw1 = payload.packed_shape_cost;
+ src_grf2_dw0 = 0;
+ //src_grf3_dw7 = (BWDCostCenter3Y << 16) | (BWDCostCenter3X) ;
+ src_grf3_dw7 = payload.cc3 >> 32;
+ //src_grf3_dw6 = (FWDCostCenter3Y << 16) | (FWDCostCenter3X) ;
+ src_grf3_dw6 = payload.cc3;
+ //src_grf3_dw5 = (BWDCostCenter2Y << 16) | (BWDCostCenter2X) ;
+ src_grf3_dw5 = payload.cc2 >> 32;
+ //src_grf3_dw4 = (FWDCostCenter2Y << 16) | (FWDCostCenter2X) ;
+ src_grf3_dw4 = payload.cc2;
+ //src_grf3_dw3 = (BWDCostCenter1Y << 16) | (BWDCostCenter1X) ;
+ src_grf3_dw3 = payload.cc1 >> 32;
+ //src_grf3_dw2 = (FWDCostCenter1Y << 16) | (FWDCostCenter1X) ;
+ src_grf3_dw2 = payload.cc1;
+ //src_grf3_dw1 = (BWDCostCenter0Y << 16) | (BWDCostCenter0X) ;
+ src_grf3_dw1 = payload.cc0 >> 32;
+ //src_grf3_dw0 = (FWDCostCenter0Y << 16) | (FWDCostCenter0X) ;
+ src_grf3_dw0 = payload.cc0;
+
+ //XXX: TODO: set search path
+ src_grf4_dw7 = 0;
+ src_grf4_dw6 = 0;
+ src_grf4_dw5 = 0;
+ src_grf4_dw4 = 0;
+ src_grf4_dw3 = 0;
+ src_grf4_dw2 = 0;
+ src_grf4_dw1 = 0;
+ src_grf4_dw0 = 0;
+ src_grf5_dw7 = 0;
+ src_grf5_dw6 = 0;
+ src_grf5_dw5 = 0;
+ src_grf5_dw4 = 0;
+ src_grf5_dw3 = 0;
+ src_grf5_dw2 = 0;
+ src_grf5_dw1 = 0;
+ src_grf5_dw0 = 0;
+
+ intel_sub_group_avc_ime_result_t ime_result;
+ ime_result = __gen_ocl_ime(src_image, ref_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+ src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+ src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+ src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+ src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+ src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+ //msg_type
+ 2);
+
+ return ime_result;
+}
+
+ulong intel_sub_group_avc_ime_get_motion_vectors(intel_sub_group_avc_ime_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint fwd_mv, bwd_mv;
+ if(lid_x < 4){
+ fwd_mv = intel_sub_group_shuffle(result.s0, 8 + lid_x*2);
+ bwd_mv = intel_sub_group_shuffle(result.s0, 9 + lid_x*2);
+ }
+ else if(lid_x >= 4 && lid_x <= 12){
+ fwd_mv = intel_sub_group_shuffle(result.s1, 0 + (lid_x-4)*2);
+ bwd_mv = intel_sub_group_shuffle(result.s1, 1 + (lid_x-4)*2);
+ }
+ else if(lid_x < 16){
+ fwd_mv = intel_sub_group_shuffle(result.s2, 0 + (lid_x-12)*2);
+ bwd_mv = intel_sub_group_shuffle(result.s2, 1 + (lid_x-12)*2);
+ }
+
+ ulong res = (bwd_mv << 32) | (fwd_mv & 0x00000000ffffffff);
+ return res;
+}
+
+ushort intel_sub_group_avc_ime_get_inter_distortions(intel_sub_group_avc_ime_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint write_back_dw = intel_sub_group_shuffle(result.s2, 8 + lid_x/2);
+ int start_bit = lid_x%2 * 16;
+ ushort distortion = (write_back_dw >> start_bit);
+ return distortion;
+}
+
+uchar intel_sub_group_avc_ime_get_inter_major_shape(intel_sub_group_avc_ime_result_t result){
+ uint write_back_dw00 = intel_sub_group_shuffle(result.s0, 0);
+ uchar major_shape = write_back_dw00 & 0x03;
+ return major_shape;
+}
+
+uchar intel_sub_group_avc_ime_get_inter_minor_shapes(intel_sub_group_avc_ime_result_t result){
+ uint write_back_dw06 = intel_sub_group_shuffle(result.s0, 6);
+ uchar minor_shape = (write_back_dw06 >> 8) & 0xff;
+ return minor_shape;
+}
+
+uchar intel_sub_group_avc_ime_get_inter_directions(intel_sub_group_avc_ime_result_t result){
+ uint write_back_dw06 = intel_sub_group_shuffle(result.s0, 6);
+ uchar direction = (write_back_dw06 >> 16) & 0xff;
+ return direction;
+}
+
+intel_sub_group_avc_ref_payload_t
+intel_sub_group_avc_fme_initialize(ushort2 src_coord,
+ ulong motion_vectors,
+ uchar major_shapes,
+ uchar minor_shapes,
+ uchar directions,
+ uchar pixel_resolution,
+ uchar sad_adjustment ){
+ intel_sub_group_avc_ref_payload_t pl;
+ pl.srcCoord = src_coord;
+ pl.mv = motion_vectors;
+ pl.major_shape = major_shapes;
+ pl.minor_shapes = minor_shapes;
+ pl.directions = directions;
+ pl.pixel_mode = pixel_resolution;
+ pl.sad_adjustment = sad_adjustment;
+#if REF_ENABLE_COST_PENALTY
+ pl.cc0 = 0;
+ pl.cc1 = 0;
+ pl.cc2 = 0;
+ pl.cc3 = 0;
+ pl.packed_cost_table = (uint2)(0, 0);
+ pl.cost_precision = 2;
+ pl.packed_shape_cost = 0;
+#endif
+ return pl;
+}
+
+intel_sub_group_avc_ref_result_t
+intel_sub_group_avc_ref_evaluate_with_single_reference(read_only image2d_t src_image,
+ read_only image2d_t ref_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_ref_payload_t payload){
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+ uint src_grf5_dw7;
+ uint src_grf5_dw6;
+ uint src_grf5_dw5;
+ uint src_grf5_dw4;
+ uint src_grf5_dw3;
+ uint src_grf5_dw2;
+ uint src_grf5_dw1;
+ uint src_grf5_dw0;
+ uint src_grf6_dw7;
+ uint src_grf6_dw6;
+ uint src_grf6_dw5;
+ uint src_grf6_dw4;
+ uint src_grf6_dw3;
+ uint src_grf6_dw2;
+ uint src_grf6_dw1;
+ uint src_grf6_dw0;
+ uint src_grf7_dw7;
+ uint src_grf7_dw6;
+ uint src_grf7_dw5;
+ uint src_grf7_dw4;
+ uint src_grf7_dw3;
+ uint src_grf7_dw2;
+ uint src_grf7_dw1;
+ uint src_grf7_dw0;
+
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+ src_grf0_dw5 = 0;
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (0 << 24) | (0 << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (payload.sad_adjustment << 20)| (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (payload.pixel_mode << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (payload.srcCoord.y << 16) | (payload.srcCoord.x);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = 0;
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = 0;
+
+
+ /*src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ | (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);*/
+ src_grf1_dw7 = 0;
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = Reseverd for BDW+
+ src_grf1_dw4 = Reseverd for BDW+*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ //src_grf1_dw3 = Weighted SAD Control Sub-block 0...15
+ src_grf1_dw3 = 0;
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ src_grf1_dw2 = 0;
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ //src_grf1_dw1 = (0 << 24) | (2);
+ src_grf1_dw1 = (0 << 24) | (16);
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 0;
+
+ //src_grf2_dw7 = SIC Forward Transform Coeff Threshold Matrix[3...6]
+ src_grf2_dw7 = 0;
+ //src_grf2_dw6 = SIC Forward Transform Coeff Threshold Matrix[0...2]
+ src_grf2_dw6 = 0;
+ //src_grf2_dw5 = (Reserved << 24) | (FBR_SubPredMode_Input << 16) | (FBR_SubMBShape_Input << 8) | (Reserved << 2) | (FBR_MbMode_Input);
+ src_grf2_dw5 = (0 << 24) | (payload.directions << 16) | (payload.minor_shapes << 8) | (payload.major_shape);
+#if REF_ENABLE_COST_PENALTY
+ //src_grf2_dw4 = MV_4_Cost ... MV_7_Cost;
+ src_grf2_dw4 = payload.packed_cost_table.s1;
+ //src_grf2_dw3 = MV_0_Cost ... MV_3_Cost;
+ src_grf2_dw3 = payload.packed_cost_table.s0;
+ //src_grf2_dw2 = ... Mode 8 Cost;
+ src_grf2_dw2 = (payload.packed_shape_cost >> 32) & 0x000000ff;
+ //src_grf2_dw1 = Mode 4 Cost ... Mode 7 Cost
+ src_grf2_dw1 = payload.packed_shape_cost;
+ src_grf2_dw0 = 0;
+ //src_grf3_dw7 = (BWDCostCenter3Y << 16) | (BWDCostCenter3X) ;
+ src_grf3_dw7 = payload.cc3 >> 32;
+ //src_grf3_dw6 = (FWDCostCenter3Y << 16) | (FWDCostCenter3X) ;
+ src_grf3_dw6 = payload.cc3;
+ //src_grf3_dw5 = (BWDCostCenter2Y << 16) | (BWDCostCenter2X) ;
+ src_grf3_dw5 = payload.cc2 >> 32;
+ //src_grf3_dw4 = (FWDCostCenter2Y << 16) | (FWDCostCenter2X) ;
+ src_grf3_dw4 = payload.cc2;
+ //src_grf3_dw3 = (BWDCostCenter1Y << 16) | (BWDCostCenter1X) ;
+ src_grf3_dw3 = payload.cc1 >> 32;
+ //src_grf3_dw2 = (FWDCostCenter1Y << 16) | (FWDCostCenter1X) ;
+ src_grf3_dw2 = payload.cc1;
+ //src_grf3_dw1 = (BWDCostCenter0Y << 16) | (BWDCostCenter0X) ;
+ src_grf3_dw1 = payload.cc0 >> 32;
+ //src_grf3_dw0 = (FWDCostCenter0Y << 16) | (FWDCostCenter0X) ;
+ src_grf3_dw0 = payload.cc0;
+#else
+ src_grf2_dw4 = 0;
+ src_grf2_dw3 = 0;
+ src_grf2_dw2 = 0;
+ src_grf2_dw1 = 0;
+ src_grf2_dw0 = 0;
+ src_grf3_dw7 = 0;
+ src_grf3_dw6 = 0;
+ src_grf3_dw5 = 0;
+ src_grf3_dw4 = 0;
+ src_grf3_dw3 = 0;
+ src_grf3_dw2 = 0;
+ src_grf3_dw1 = 0;
+ src_grf3_dw0 = 0;
+#endif
+
+ //grf4...grf7 = Ref0/1 Sub-block XY 0...15
+ int2 bi_mv_temp = as_int2( payload.mv );
+ int2 bi_mv = intel_sub_group_shuffle(bi_mv_temp, 3);
+ src_grf4_dw7 = bi_mv.s1;
+ src_grf4_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 2);
+ src_grf4_dw5 = bi_mv.s1;
+ src_grf4_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 1);
+ src_grf4_dw3 = bi_mv.s1;
+ src_grf4_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 0);
+ src_grf4_dw1 = bi_mv.s1;
+ src_grf4_dw0 = bi_mv.s0;
+
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 7);
+ src_grf5_dw7 = bi_mv.s1;
+ src_grf5_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 6);
+ src_grf5_dw5 = bi_mv.s1;
+ src_grf5_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 5);
+ src_grf5_dw3 = bi_mv.s1;
+ src_grf5_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 4);
+ src_grf5_dw1 = bi_mv.s1;
+ src_grf5_dw0 = bi_mv.s0;
+
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 11);
+ src_grf6_dw7 = bi_mv.s1;
+ src_grf6_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 10);
+ src_grf6_dw5 = bi_mv.s1;
+ src_grf6_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 9);
+ src_grf6_dw3 = bi_mv.s1;
+ src_grf6_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 8);
+ src_grf6_dw1 = bi_mv.s1;
+ src_grf6_dw0 = bi_mv.s0;
+
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 15);
+ src_grf7_dw7 = bi_mv.s1;
+ src_grf7_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 14);
+ src_grf7_dw5 = bi_mv.s1;
+ src_grf7_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 13);
+ src_grf7_dw3 = bi_mv.s1;
+ src_grf7_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 12);
+ src_grf7_dw1 = bi_mv.s1;
+ src_grf7_dw0 = bi_mv.s0;
+
+ intel_sub_group_avc_ref_result_t ref_result;
+ ref_result = __gen_ocl_ime(src_image, ref_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+ src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+ src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+ src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+ src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+ src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+ //msg_type
+ 3);
+
+ return ref_result;
+}
+
+ulong intel_sub_group_avc_ref_get_motion_vectors(intel_sub_group_avc_ref_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint fwd_mv, bwd_mv;
+ if(lid_x < 4){
+ fwd_mv = intel_sub_group_shuffle(result.s0, 8 + lid_x*2);
+ bwd_mv = intel_sub_group_shuffle(result.s0, 9 + lid_x*2);
+ }
+ else if(lid_x >= 4 && lid_x <= 12){
+ fwd_mv = intel_sub_group_shuffle(result.s1, 0 + (lid_x-4)*2);
+ bwd_mv = intel_sub_group_shuffle(result.s1, 1 + (lid_x-4)*2);
+ }
+ else if(lid_x < 16){
+ fwd_mv = intel_sub_group_shuffle(result.s2, 0 + (lid_x-12)*2);
+ bwd_mv = intel_sub_group_shuffle(result.s2, 1 + (lid_x-12)*2);
+ }
+
+ ulong res = (bwd_mv << 32) | (fwd_mv & 0x00000000ffffffff);
+ return res;
+}
+
+ushort intel_sub_group_avc_ref_get_inter_distortions(intel_sub_group_avc_ref_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint write_back_dw = intel_sub_group_shuffle(result.s2, 8 + lid_x/2);
+ int start_bit = lid_x%2 * 16;
+ ushort distortion = (write_back_dw >> start_bit);
+ return distortion;
+}
+
+uint2 intel_sub_group_avc_mce_get_default_medium_penalty_cost_table(void){
+ #define COST_PENALTY(idx, base, shift) \
+ uchar cost_penalty_##idx = (shift << 4) | (base);
+
+ COST_PENALTY(0, 1, 0)
+ COST_PENALTY(1, 1, 0)
+ COST_PENALTY(2, 1, 0)
+ COST_PENALTY(3, 1, 0)
+ COST_PENALTY(4, 1, 0)
+ COST_PENALTY(5, 1, 0)
+ COST_PENALTY(6, 1, 0)
+ COST_PENALTY(7, 1, 0)
+ uint2 cost_table;
+ cost_table.s0 = cost_penalty_0 | (cost_penalty_1 << 8) | ( cost_penalty_2 << 16) | (cost_penalty_3 << 24);
+ cost_table.s1 = cost_penalty_4 | (cost_penalty_5 << 8) | ( cost_penalty_6 << 16) | (cost_penalty_7 << 24);
+ return cost_table;
+}
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_set_motion_vector_cost_function(ulong packed_cost_center_delta,
+ uint2 packed_cost_table,
+ uchar cost_precision,
+ intel_sub_group_avc_ime_payload_t payload){
+ intel_sub_group_avc_ime_payload_t pl = payload;
+ pl.packed_cost_table = packed_cost_table;
+ pl.cost_precision = cost_precision;
+
+ uint lid_x = get_sub_group_local_id();
+ if(lid_x == 0)
+ pl.cc0 = packed_cost_center_delta;
+ else if(lid_x == 1)
+ pl.cc1 = packed_cost_center_delta;
+ else if(lid_x == 2)
+ pl.cc2 = packed_cost_center_delta;
+ else if(lid_x == 3)
+ pl.cc3 = packed_cost_center_delta;
+ else{
+ }
+ return pl;
+}
+
+#if REF_ENABLE_COST_PENALTY
+intel_sub_group_avc_ref_payload_t
+intel_sub_group_avc_ref_set_motion_vector_cost_function(ulong packed_cost_center_delta,
+ uint2 packed_cost_table,
+ uchar cost_precision,
+ intel_sub_group_avc_ref_payload_t payload){
+ intel_sub_group_avc_ref_payload_t pl = payload;
+ pl.packed_cost_table = packed_cost_table;
+ pl.cost_precision = cost_precision;
+
+ uint lid_x = get_sub_group_local_id();
+ if(lid_x == 0)
+ pl.cc0 = packed_cost_center_delta;
+ else if(lid_x == 1)
+ pl.cc1 = packed_cost_center_delta;
+ else if(lid_x == 2)
+ pl.cc2 = packed_cost_center_delta;
+ else if(lid_x == 3)
+ pl.cc3 = packed_cost_center_delta;
+ else{
+ }
+ return pl;
+}
+
+#endif
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_set_inter_shape_penalty(ulong packed_shape_cost,
+ intel_sub_group_avc_ime_payload_t payload){
+ intel_sub_group_avc_ime_payload_t pl = payload;
+ pl.packed_shape_cost = packed_shape_cost;
+ return pl;
+}
+
+intel_sub_group_avc_sic_result_t
+intel_sub_group_avc_sic_evaluate_ipe(read_only image2d_t src_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_sic_payload_t payload){
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+ uint src_grf5_dw7;
+ uint src_grf5_dw6;
+ uint src_grf5_dw5;
+ uint src_grf5_dw4;
+ uint src_grf5_dw3;
+ uint src_grf5_dw2;
+ uint src_grf5_dw1;
+ uint src_grf5_dw0;
+ uint src_grf6_dw7;
+ uint src_grf6_dw6;
+ uint src_grf6_dw5;
+ uint src_grf6_dw4;
+ uint src_grf6_dw3;
+ uint src_grf6_dw2;
+ uint src_grf6_dw1;
+ uint src_grf6_dw0;
+ uint src_grf7_dw7;
+ uint src_grf7_dw6;
+ uint src_grf7_dw5;
+ uint src_grf7_dw4;
+ uint src_grf7_dw3;
+ uint src_grf7_dw2;
+ uint src_grf7_dw1;
+ uint src_grf7_dw0;
+
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+ src_grf0_dw5 = 0;
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (0 << 24) | (payload.intra_sad_adjustment << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (0 << 20) | (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (0 << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (payload.srcCoord.y<<16) | (payload.srcCoord.x);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = 0;
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = 0;
+
+ //src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ src_grf1_dw7 = (0 << 24) | (0 << 22) | (0 << 21)
+ //| (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (0 << 20) | (0 << 19) | (0 << 18)
+ //| (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (0 << 16) | (payload.intra_neighbour_availabilty << 8) | (0 << 7)
+ //| (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);
+ | (0 << 6) | (0 << 5) | (payload.luma_intra_partition_mask);
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = Reseverd for BDW+
+ src_grf1_dw4 = Reseverd for BDW+*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ //src_grf1_dw3 = Weighted SAD Control Sub-block 0...15
+ src_grf1_dw3 = 0;
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ src_grf1_dw2 = 0;
+
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ src_grf1_dw1 = 0;
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 0;
+
+ //cost related
+ src_grf2_dw7 = 0;
+ src_grf2_dw6 = 0;
+ src_grf2_dw5 = 0;
+ src_grf2_dw4 = 0;
+ src_grf2_dw3 = 0;
+ src_grf2_dw2 = 0;
+ src_grf2_dw1 = 0;
+ //src_grf2_dw0 = (MODE_INTRA_4x4 << 24) | (MODE_INTRA_8x8 << 16) | (MODE_INTRA_16x16 << 8) | (MODE_INTRA_NONPRED);
+ src_grf2_dw0 = payload.intra_shape_cost;
+ src_grf3_dw7 = 0;
+ src_grf3_dw6 = 0;
+ src_grf3_dw5 = 0;
+ src_grf3_dw4 = 0;
+ src_grf3_dw3 = 0;
+ src_grf3_dw2 = 0;
+ src_grf3_dw1 = 0;
+ src_grf3_dw0 = 0;
+
+ //Ref* SkipCenter* Delta XY
+ /*src_grf4_dw7 = Ref1_SkipCenter_3_Delta_XY;
+ src_grf4_dw6 = Ref0_SkipCenter_3_Delta_XY;
+ src_grf4_dw5 = Ref1_SkipCenter_2_Delta_XY;
+ src_grf4_dw4 = Ref0_SkipCenter_3_Delta_XY;
+ src_grf4_dw3 = Ref1_SkipCenter_1_Delta_XY;
+ src_grf4_dw2 = Ref0_SkipCenter_1_Delta_XY;
+ src_grf4_dw1 = Ref1_SkipCenter_0_Delta_XY;
+ src_grf4_dw0 = (Ref0_Skip_Center_0_Delta_Y << 16) | (Ref0_Skip_Center_0_Delta_X);*/
+ src_grf4_dw7 = 0;
+ src_grf4_dw6 = 0;
+ src_grf4_dw5 = 0;
+ src_grf4_dw4 = 0;
+ src_grf4_dw3 = 0;
+ src_grf4_dw2 = 0;
+ src_grf4_dw1 = 0;
+ src_grf4_dw0 = 0;
+
+ //src_grf5_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+ src_grf5_dw7 = payload.ur_20_23;
+ //src_grf5_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+ src_grf5_dw6 = payload.ur_16_19;
+ //src_grf5_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+ src_grf5_dw5 = payload.u_12_15;
+ //src_grf5_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+ src_grf5_dw4 = payload.u_8_11;
+ //src_grf5_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+ src_grf5_dw3 = payload.u_4_7;
+ //src_grf5_dw2 = (Neighbor pixel Luma value [3, -1] << 24) | (Neighbor pixel Luma value [2, -1] << 16)
+ //| (Neighbor pixel Luma value [1, -1] << 8) | (Neighbor pixel Luma value [0, -1]);
+ src_grf5_dw2 = payload.u_0_3;
+ uchar mode_mask_16_16 = 0xf;
+ ushort mode_mask_8_8 = 0x01ff, mode_mask_4_4 = 0x01ff;
+ if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL){
+ mode_mask_16_16 = 0;
+ mode_mask_8_8 = 0;
+ mode_mask_4_4 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL){
+ mode_mask_16_16 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL){
+ mode_mask_8_8 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL){
+ mode_mask_4_4 = 0;
+ }
+ //src_grf5_dw1 = (Corner_Neighbor_pixel_0 << 24) | (Reserved << 10) | (IntraComputeType << 8)
+ //| (IntraChromaModeMask << 4) | (Intra16x16ModeMask);
+ src_grf5_dw1 = (payload.upper_left_corner_luma_pixel << 24) | (0 << 10) | (1 << 8) | (0xf << 4) | (mode_mask_16_16);
+ //src_grf5_dw0 = (Reserved<<25) | (Intra_8x8_Mode_Mask << 16) | (Reserved<<9) | (Intra_4x4_Mode_Mask);
+ src_grf5_dw0 = (0<<25) | (mode_mask_8_8 << 16) | (0<<9) | (mode_mask_4_4);
+ //src_grf6_dw7 = (Reserved << 24) | (Penalty_4x4_non_DC << 16) | (Penalty_8x8_non_DC << 8) | (Penalty_16x16_non_DC);
+ src_grf6_dw7 = 0;
+ //src_grf6_dw6 = Reserved;
+ src_grf6_dw6 = 0;
+ //src_grf6_dw5 = (Reserved << 16) | (Neighbor pixel Chroma value CbCr pair [-1, -1]);
+ src_grf6_dw5 = 0;
+ //src_grf6_dw4 = (Intra_MxM_Pred_Mode_B15 << 28) | (Intra_MxM_Pred_Mode_B14 << 24) | (Intra_MxM_Pred_Mode_B11 << 20)
+ //| (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12) | (Intra_MxM_Pred_Mode_A13 << 8)
+ //| (Intra_MxM_Pred_Mode_A7 << 4) | (Intra_MxM_Pred_Mode_A5);
+ //XXX: Which value should be set to?
+ src_grf6_dw4 = (2 << 28) | (2 << 24) | (2 << 20)
+ | (2 << 16) | (2 << 12) | (2 << 8)
+ | (2 << 4) | (2);
+ //src_grf6_dw3 = (Corner_Neighbor_pixel_1 << 24) | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+ src_grf6_dw3 = payload.l_12_15;
+ //src_grf6_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+ src_grf6_dw2 = payload.l_8_11;
+ //src_grf6_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+ src_grf6_dw1 = payload.l_4_7;
+ //src_grf6_dw0 = (Neighbor pixel Luma value [-1, 3] << 24) | (Neighbor pixel Luma value [-1, 2] << 16)
+ //| (Neighbor pixel Luma value [-1, 1] << 8) | (Neighbor pixel Luma value [-1, 0]);
+ src_grf6_dw0 = payload.l_0_3;
+
+
+ //chroma related
+ src_grf7_dw7 = 0;
+ src_grf7_dw6 = 0;
+ src_grf7_dw5 = 0;
+ src_grf7_dw4 = 0;
+ src_grf7_dw3 = 0;
+ src_grf7_dw2 = 0;
+ src_grf7_dw1 = 0;
+ src_grf7_dw0 = 0;
+
+
+ intel_sub_group_avc_sic_result_t ime_result;
+ ime_result = __gen_ocl_ime(src_image, src_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+ src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+ src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+ src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+ src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+ src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+ //msg_type
+ 1);
+
+ return ime_result;
+}
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_initialize(ushort2 src_coord ){
+ intel_sub_group_avc_sic_payload_t pl;
+ pl.srcCoord = src_coord;
+ pl.intra_shape_cost = 0;
+ return pl;
+}
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_configure_ipe(uchar luma_intra_partition_mask,
+ uchar intra_neighbour_availabilty,
+ uchar left_edge_luma_pixels,
+ uchar upper_left_corner_luma_pixel,
+ uchar upper_edge_luma_pixels,
+ uchar upper_right_edge_luma_pixels,
+ uchar intra_sad_adjustment,
+ intel_sub_group_avc_sic_payload_t payload ){
+ intel_sub_group_avc_sic_payload_t pl = payload;
+ pl.luma_intra_partition_mask = luma_intra_partition_mask;
+ pl.intra_neighbour_availabilty = intra_neighbour_availabilty;
+ uchar pixel[16];
+ for(uint i = 0; i < 16; i++)
+ pixel[i] = intel_sub_group_shuffle(left_edge_luma_pixels, i);
+
+ pl.l_0_3 = (pixel[3] << 24) | (pixel[2] << 16) | (pixel[1] << 8) | (pixel[0]);
+ pl.l_4_7 = (pixel[7] << 24) | (pixel[6] << 16) | (pixel[5] << 8) | (pixel[4]);
+ pl.l_8_11 = (pixel[11] << 24) | (pixel[10] << 16) | (pixel[9] << 8) | (pixel[8]);
+ pl.l_12_15 = (pixel[15] << 24) | (pixel[14] << 16) | (pixel[13] << 8) | (pixel[12]);
+
+ for(uint i = 0; i < 16; i++)
+ pixel[i] = intel_sub_group_shuffle(upper_edge_luma_pixels, i);
+ pl.u_0_3 = (pixel[3] << 24) | (pixel[2] << 16) | (pixel[1] << 8) | (pixel[0]);
+ pl.u_4_7 = (pixel[7] << 24) | (pixel[6] << 16) | (pixel[5] << 8) | (pixel[4]);
+ pl.u_8_11 = (pixel[11] << 24) | (pixel[10] << 16) | (pixel[9] << 8) | (pixel[8]);
+ pl.u_12_15 = (pixel[15] << 24) | (pixel[14] << 16) | (pixel[13] << 8) | (pixel[12]);
+
+ for(uint i = 0; i < 8; i++)
+ pixel[i] = intel_sub_group_shuffle(upper_right_edge_luma_pixels, i);
+ pl.ur_16_19 = (pixel[3] << 24) | (pixel[2] << 16) | (pixel[1] << 8) | (pixel[0]);
+ pl.ur_20_23 = (pixel[7] << 24) | (pixel[6] << 16) | (pixel[5] << 8) | (pixel[4]);
+
+ pl.upper_left_corner_luma_pixel = upper_left_corner_luma_pixel;
+ pl.intra_sad_adjustment = intra_sad_adjustment;
+ return pl;
+}
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_set_intra_luma_shape_penalty(uint packed_shape_cost,
+ intel_sub_group_avc_sic_payload_t payload ){
+ intel_sub_group_avc_sic_payload_t pl = payload;
+ pl.intra_shape_cost = packed_shape_cost;
+ return pl;
+}
+
+intel_sub_group_avc_sic_result_t
+intel_sub_group_avc_sic_evaluate_with_single_reference(read_only image2d_t src_image,
+ read_only image2d_t ref_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_sic_payload_t payload){
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+ uint src_grf5_dw7;
+ uint src_grf5_dw6;
+ uint src_grf5_dw5;
+ uint src_grf5_dw4;
+ uint src_grf5_dw3;
+ uint src_grf5_dw2;
+ uint src_grf5_dw1;
+ uint src_grf5_dw0;
+ uint src_grf6_dw7;
+ uint src_grf6_dw6;
+ uint src_grf6_dw5;
+ uint src_grf6_dw4;
+ uint src_grf6_dw3;
+ uint src_grf6_dw2;
+ uint src_grf6_dw1;
+ uint src_grf6_dw0;
+ uint src_grf7_dw7;
+ uint src_grf7_dw6;
+ uint src_grf7_dw5;
+ uint src_grf7_dw4;
+ uint src_grf7_dw3;
+ uint src_grf7_dw2;
+ uint src_grf7_dw1;
+ uint src_grf7_dw0;
+
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+ src_grf0_dw5 = 0;
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (0 << 24) | (payload.intra_sad_adjustment << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (payload.skip_sad_adjustment << 20) | (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (0 << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+ src_grf0_dw3 |= payload.skip_block_partition_type;
+ //Block-Based Skip Enabled
+ if(payload.skip_block_partition_type == CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL)
+ src_grf0_dw3 |= (1 << 19);
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (payload.srcCoord.y << 16) | (payload.srcCoord.x);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = 0;
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = 0;
+
+ //src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ src_grf1_dw7 = (0 << 24) | (0 << 22) | (0 << 21)
+ //| (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (0 << 20) | (0 << 19) | (0 << 18)
+ //| (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (0 << 16) | (payload.intra_neighbour_availabilty << 8) | (0 << 7)
+ //| (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);
+ | (0 << 6) | (0 << 5) | (payload.luma_intra_partition_mask);
+ src_grf1_dw7 |= payload.skip_motion_vector_mask;
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = (Cost_Center1Y << 16) | (Cost_Center1X);
+ src_grf1_dw4 = (Cost_Center0Y << 16) | (Cost_Center0X);
+ src_grf1_dw3 = (Ime_Too_Good << 24 ) | (Ime_Too_Bad << 16) | (Part_Tolerance_Thrhd << 8) | (FBPrunThrhd);*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ src_grf1_dw3 = 0;
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ src_grf1_dw2 = 0;
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ src_grf1_dw1 = (0 << 24) | (payload.bidirectional_weight << 16) | (16);
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 1;
+
+ //src_grf2_dw7 = SIC Forward Transform Coeff Threshold Matrix[3...6]
+ src_grf2_dw7 = 0;
+ //src_grf2_dw6 = SIC Forward Transform Coeff Threshold Matrix[0...2]
+ src_grf2_dw6 = 0;
+ //src_grf2_dw5 = (Reserved << 24) | (FBR_SubPredMode_Input << 16) | (FBR_SubMBShape_Input << 8) | (Reserved << 2) | (FBR_MbMode_Input);
+ src_grf2_dw5 = 0;
+ //XXX: TO DO: setting mv cost related bit filed
+ //src_grf2_dw4 = MV_4_Cost ... MV_7_Cost;
+ src_grf2_dw4 = 0;
+ //src_grf2_dw3 = MV_0_Cost ... MV_3_Cost;
+ src_grf2_dw3 = 0;
+ //src_grf2_dw2 = (Chroma_Intra_Mode_Cost << 24) | (RefID_Cost << 16) | (Mode_9_Cost << 8) | (Mode_8_Cost);
+ src_grf2_dw2 = 0;
+ //src_grf2_dw1 = Mode 4 Cost ... Mode 7 Cost
+ src_grf2_dw1 = 0;
+ //src_grf2_dw0 = (MODE_INTRA_4x4 << 24) | (MODE_INTRA_8x8 << 16) | (MODE_INTRA_16x16 << 8) | (MODE_INTRA_NONPRED);
+ src_grf2_dw0 = payload.intra_shape_cost;
+ /*
+ //src_grf3_dw7 = (BWDCostCenter3Y << 16) | (BWDCostCenter3X) ;
+ src_grf3_dw7 = payload.cc3 >> 32;
+ //src_grf3_dw6 = (FWDCostCenter3Y << 16) | (FWDCostCenter3X) ;
+ src_grf3_dw6 = payload.cc3;
+ //src_grf3_dw5 = (BWDCostCenter2Y << 16) | (BWDCostCenter2X) ;
+ src_grf3_dw5 = payload.cc2 >> 32;
+ //src_grf3_dw4 = (FWDCostCenter2Y << 16) | (FWDCostCenter2X) ;
+ src_grf3_dw4 = payload.cc2;
+ //src_grf3_dw3 = (BWDCostCenter1Y << 16) | (BWDCostCenter1X) ;
+ src_grf3_dw3 = payload.cc1 >> 32;
+ //src_grf3_dw2 = (FWDCostCenter1Y << 16) | (FWDCostCenter1X) ;
+ src_grf3_dw2 = payload.cc1;
+ //src_grf3_dw1 = (BWDCostCenter0Y << 16) | (BWDCostCenter0X) ;
+ src_grf3_dw1 = payload.cc0 >> 32;
+ //src_grf3_dw0 = (FWDCostCenter0Y << 16) | (FWDCostCenter0X) ;
+ src_grf3_dw0 = payload.cc0;*/
+ src_grf3_dw7 = 0;
+ src_grf3_dw6 = 0;
+ src_grf3_dw5 = 0;
+ src_grf3_dw4 = 0;
+ src_grf3_dw3 = 0;
+ src_grf3_dw2 = 0;
+ src_grf3_dw1 = 0;
+ src_grf3_dw0 = 0;
+
+ //Ref1/Ref0 SkipCenter 3...0 Delta XY
+ int2 bi_mv_temp = as_int2( payload.mv );
+ int2 bi_mv = intel_sub_group_shuffle(bi_mv_temp, 3);
+ src_grf4_dw7 = bi_mv.s1;
+ src_grf4_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 2);
+ src_grf4_dw5 = bi_mv.s1;
+ src_grf4_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 1);
+ src_grf4_dw3 = bi_mv.s1;
+ src_grf4_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 0);
+ src_grf4_dw1 = bi_mv.s1;
+ src_grf4_dw0 = bi_mv.s0;
+
+ //src_grf5_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+ src_grf5_dw7 = payload.ur_20_23;
+ //src_grf5_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+ src_grf5_dw6 = payload.ur_16_19;
+ //src_grf5_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+ src_grf5_dw5 = payload.u_12_15;
+ //src_grf5_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+ src_grf5_dw4 = payload.u_8_11;
+ //src_grf5_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+ src_grf5_dw3 = payload.u_4_7;
+ //src_grf5_dw2 = (Neighbor pixel Luma value [3, -1] << 24) | (Neighbor pixel Luma value [2, -1] << 16)
+ //| (Neighbor pixel Luma value [1, -1] << 8) | (Neighbor pixel Luma value [0, -1]);
+ src_grf5_dw2 = payload.u_0_3;
+ uchar mode_mask_16_16 = 0xf;
+ ushort mode_mask_8_8 = 0x01ff, mode_mask_4_4 = 0x01ff;
+ if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL){
+ mode_mask_16_16 = 0;
+ mode_mask_8_8 = 0;
+ mode_mask_4_4 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL){
+ mode_mask_16_16 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL){
+ mode_mask_8_8 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL){
+ mode_mask_4_4 = 0;
+ }
+ //src_grf5_dw1 = (Corner_Neighbor_pixel_0 << 24) | (Reserved << 10) | (IntraComputeType << 8)
+ //| (IntraChromaModeMask << 4) | (Intra16x16ModeMask);
+ src_grf5_dw1 = (payload.upper_left_corner_luma_pixel << 24) | (0 << 10) | (1 << 8) | (0xf << 4) | (mode_mask_16_16);
+ //src_grf5_dw1 = (payload.upper_left_corner_luma_pixel << 24) | (0 << 10) | (1 << 8) | (0xf << 4) | (0xb);
+ //src_grf5_dw0 = (Reserved<<25) | (Intra_8x8_Mode_Mask << 16) | (Reserved<<9) | (Intra_4x4_Mode_Mask);
+ src_grf5_dw0 = (0<<25) | (mode_mask_8_8 << 16) | (0<<9) | (mode_mask_4_4);
+ //src_grf6_dw7 = (Reserved << 24) | (Penalty_4x4_non_DC << 16) | (Penalty_8x8_non_DC << 8) | (Penalty_16x16_non_DC);
+ src_grf6_dw7 = 0;
+ //src_grf6_dw6 = Reserved;
+ src_grf6_dw6 = 0;
+ //src_grf6_dw5 = (Reserved << 16) | (Neighbor pixel Chroma value CbCr pair [-1, -1]);
+ src_grf6_dw5 = 0;
+ //src_grf6_dw4 = (Intra_MxM_Pred_Mode_B15 << 28) | (Intra_MxM_Pred_Mode_B14 << 24) | (Intra_MxM_Pred_Mode_B11 << 20)
+ //| (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12) | (Intra_MxM_Pred_Mode_A13 << 8)
+ //| (Intra_MxM_Pred_Mode_A7 << 4) | (Intra_MxM_Pred_Mode_A5);
+ //XXX: Which value should be set to?
+ src_grf6_dw4 = (2 << 28) | (2 << 24) | (2 << 20)
+ | (2 << 16) | (2 << 12) | (2 << 8)
+ | (2 << 4) | (2);
+ //src_grf6_dw3 = (Corner_Neighbor_pixel_1 << 24) | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+ src_grf6_dw3 = payload.l_12_15;
+ //src_grf6_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+ src_grf6_dw2 = payload.l_8_11;
+ //src_grf6_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+ src_grf6_dw1 = payload.l_4_7;
+ //src_grf6_dw0 = (Neighbor pixel Luma value [-1, 3] << 24) | (Neighbor pixel Luma value [-1, 2] << 16)
+ //| (Neighbor pixel Luma value [-1, 1] << 8) | (Neighbor pixel Luma value [-1, 0]);
+ src_grf6_dw0 = payload.l_0_3;
+
+
+ //chroma related
+ src_grf7_dw7 = 0;
+ src_grf7_dw6 = 0;
+ src_grf7_dw5 = 0;
+ src_grf7_dw4 = 0;
+ src_grf7_dw3 = 0;
+ src_grf7_dw2 = 0;
+ src_grf7_dw1 = 0;
+ src_grf7_dw0 = 0;
+
+
+ intel_sub_group_avc_ref_result_t sic_result;
+ sic_result = __gen_ocl_ime(src_image, ref_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+ src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+ src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+ src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+ src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+ src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+ //msg_type
+ 1);
+
+ return sic_result;
+}
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_configure_skc(uint skip_block_partition_type,
+ uint skip_motion_vector_mask,
+ ulong motion_vectors,
+ char bidirectional_weight,
+ uchar skip_sad_adjustment,
+ intel_sub_group_avc_sic_payload_t payload){
+ intel_sub_group_avc_sic_payload_t pl = payload;
+ pl.skip_block_partition_type = skip_block_partition_type;
+ pl.skip_motion_vector_mask = skip_motion_vector_mask;
+ pl.bidirectional_weight = bidirectional_weight;
+ pl.skip_sad_adjustment = skip_sad_adjustment;
+ pl.mv = motion_vectors;
+ return pl;
+}
+
+ushort
+intel_sub_group_avc_sic_get_inter_distortions(intel_sub_group_avc_sic_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint write_back_dw = intel_sub_group_shuffle(result.s2, 8 + lid_x/2);
+ int start_bit = lid_x%2 * 16;
+ ushort distortion = (write_back_dw >> start_bit);
+ return distortion;
+}
+
+uchar
+intel_sub_group_avc_sic_get_ipe_luma_shape(intel_sub_group_avc_sic_result_t result){
+ uint write_back_dw00 = intel_sub_group_shuffle(result.s0, 0);
+ uchar luma_shape = write_back_dw00 & 0x03;
+ return luma_shape;
+}
+
+ushort
+intel_sub_group_avc_sic_get_best_ipe_luma_distortion(intel_sub_group_avc_sic_result_t result){
+ uint write_back_dw03 = intel_sub_group_shuffle(result.s0, 3);
+ ushort luma_distortion = write_back_dw03;
+ return luma_distortion;
+}
+
+ulong intel_sub_group_avc_sic_get_packed_ipe_luma_modes(intel_sub_group_avc_sic_result_t result){
+ uint write_back_dw00 = intel_sub_group_shuffle(result.s0, 0);
+ uchar luma_shape = write_back_dw00 & 0x03;
+ ulong luma_modes = 0;
+ uint write_back_dw04 = intel_sub_group_shuffle(result.s0, 4);
+ uint write_back_dw05 = intel_sub_group_shuffle(result.s0, 5);
+ if(luma_shape == CLK_AVC_ME_INTRA_16x16_INTEL)
+ luma_modes |= (write_back_dw04 & 0x03);
+ else if(luma_shape == CLK_AVC_ME_INTRA_8x8_INTEL){
+ ulong modes_temp = write_back_dw04;
+ luma_modes = (modes_temp & 0x0f) | ((modes_temp & 0x00f0) << 12) | ((modes_temp & 0x0f00) << 24) | ((modes_temp & 0x0000f000) << 36);
+ }
+ else if(luma_shape == CLK_AVC_ME_INTRA_4x4_INTEL){
+ ulong modes_temp = write_back_dw05;
+ luma_modes = (modes_temp << 32) | (write_back_dw04 & 0x00000000ffffffff);
+ }
+ return luma_modes;
+}
+
bool __gen_ocl_in_local(size_t p) {
bool cond1 = p > 0;
bool cond2 = p < 64*1024;