From 9cb7ff4c285d892616595e5a43793f4d1408eca4 Mon Sep 17 00:00:00 2001 From: Chuanbo Weng Date: Wed, 14 Jun 2017 00:54:13 +0800 Subject: Implement extension cl_intel_device_side_avc_motion_estimation. This patch mainly contains: 1. built-in function __gen_ocl_ime implementation. 2. Lots of built-in functions of cl_intel_device_side_avc_motion_estimation are implemented. 3. This extension is required to run in simd16 mode. v2: move the utests to separate patches one by one; as all the utests have extension function check, no need to put them in standalone utest; uncomment the self test; fix extension check logic issue, should be && instead of ||. Signed-off-by: Chuanbo Weng Signed-off-by: Xionghu Luo Reviewed-by: Yang Rong --- backend/src/backend/gen/gen_mesa_disasm.c | 24 +- backend/src/backend/gen8_instruction.hpp | 15 + backend/src/backend/gen9_context.cpp | 105 ++ backend/src/backend/gen9_context.hpp | 1 + backend/src/backend/gen9_encoder.cpp | 46 + backend/src/backend/gen9_encoder.hpp | 9 + backend/src/backend/gen_context.cpp | 56 +- backend/src/backend/gen_context.hpp | 1 + backend/src/backend/gen_defs.hpp | 1 + backend/src/backend/gen_encoder.cpp | 8 + backend/src/backend/gen_encoder.hpp | 4 + .../src/backend/gen_insn_gen7_schedule_info.hxx | 1 + backend/src/backend/gen_insn_selection.cpp | 63 + backend/src/backend/gen_insn_selection.hpp | 12 +- backend/src/backend/gen_insn_selection.hxx | 1 + backend/src/ir/instruction.cpp | 56 + backend/src/ir/instruction.hpp | 14 +- backend/src/ir/instruction.hxx | 1 + backend/src/ir/liveness.cpp | 1 + backend/src/libocl/include/ocl_misc.h | 364 ++++++ backend/src/libocl/src/ocl_misc.cl | 1325 ++++++++++++++++++++ backend/src/llvm/llvm_gen_backend.cpp | 36 + backend/src/llvm/llvm_gen_ocl_function.hxx | 1 + backend/src/llvm/llvm_scalarize.cpp | 1 + src/cl_command_queue.c | 7 + src/cl_device_id.c | 4 + src/cl_extensions.c | 2 +- src/cl_extensions.h | 5 +- src/intel/intel_gpgpu.c | 70 ++ src/intel/intel_structs.h | 63 + 
utests/utest_helper.cpp | 18 + utests/utest_helper.hpp | 3 + 32 files changed, 2282 insertions(+), 36 deletions(-) diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c index 8a2afe58..ca36afa1 100644 --- a/backend/src/backend/gen/gen_mesa_disasm.c +++ b/backend/src/backend/gen/gen_mesa_disasm.c @@ -370,6 +370,7 @@ static const char *target_function_gen75[16] = { [GEN_SFID_DATAPORT_DATA] = "data (0)", [GEN_SFID_PIXEL_INTERPOLATOR] = "pix_interpolator", [GEN_SFID_DATAPORT1_DATA] = "data (1)", + [GEN_SFID_CHECK_REFINE] = "check_and_refine", }; static const char *gateway_sub_function[8] = { @@ -527,6 +528,13 @@ static int gen_version; bits; \ }) +#define GEN8_BITS_FIELD(inst, gen8) \ + ({ \ + int bits; \ + bits = ((const union Gen8NativeInstruction *)inst)->gen8; \ + bits; \ + }) + #define GEN_BITS_FIELD(inst, gen) \ ({ \ int bits; \ @@ -583,6 +591,8 @@ static int gen_version; #define BRANCH_UIP(inst) GEN_BITS_FIELD2(inst, bits3.gen7_branch.uip, bits2.gen8_branch.uip/8) #define VME_BTI(inst) GEN7_BITS_FIELD(inst, bits3.vme_gen7.bti) #define VME_MSG_TYPE(inst) GEN7_BITS_FIELD(inst, bits3.vme_gen7.msg_type) +#define IME_BTI(inst) GEN8_BITS_FIELD(inst, bits3.ime_gen8.bti) +#define IME_MSG_TYPE(inst) GEN8_BITS_FIELD(inst, bits3.ime_gen8.msg_type) #define SAMPLE_BTI(inst) GEN_BITS_FIELD(inst, bits3.sampler_gen7.bti) #define SAMPLER(inst) GEN_BITS_FIELD(inst, bits3.sampler_gen7.sampler) #define SAMPLER_MSG_TYPE(inst) GEN_BITS_FIELD(inst, bits3.sampler_gen7.msg_type) @@ -1510,9 +1520,19 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac if (immbti) { switch (target) { case GEN_SFID_VIDEO_MOTION_EST: + if(gen_version == 7) + format(file, " (bti: %d, msg_type: %d)", + VME_BTI(inst), + VME_MSG_TYPE(inst)); + else if(gen_version == 9) + format(file, " (bti: %d, msg_type: %d)", + IME_BTI(inst), + IME_MSG_TYPE(inst)); + break; + case GEN_SFID_CHECK_REFINE: format(file, " (bti: %d, msg_type: %d)", - 
VME_BTI(inst), - VME_MSG_TYPE(inst)); + IME_BTI(inst), + IME_MSG_TYPE(inst)); break; case GEN_SFID_SAMPLER: format(file, " (%d, %d, %d, %d)", diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp index 446e7f9c..79e1b09b 100644 --- a/backend/src/backend/gen8_instruction.hpp +++ b/backend/src/backend/gen8_instruction.hpp @@ -430,6 +430,21 @@ union Gen8NativeInstruction uint32_t end_of_thread:1; } sampler_gen7; + struct { + uint32_t bti:8; + uint32_t pad0:5; + uint32_t msg_type:2; + uint32_t stream_out_enable:1; + uint32_t stream_in_enable:1; + uint32_t stream_out_enable2:1; + uint32_t pad1:1; + uint32_t header_present:1; + uint32_t response_length:5; + uint32_t msg_length:4; + uint32_t pad2:2; + uint32_t end_of_thread:1; + } ime_gen8; + /** * Message for the Sandybridge Sampler Cache or Constant Cache Data Port. * diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp index 2ce53b68..c81e42f2 100644 --- a/backend/src/backend/gen9_context.cpp +++ b/backend/src/backend/gen9_context.cpp @@ -62,6 +62,111 @@ namespace gbe } } + void Gen9Context::emitImeInstruction(const SelectionInstruction &insn) { + const GenRegister dst = ra->genReg(insn.dst(0)); + const unsigned int msg_type = insn.extra.ime_msg_type; + + GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3); + uint32_t execWidth_org = p->curr.execWidth; + int virt_pld_len; + int phi_pld_len = 0; + int virt_rsp_len; + +#define PHI_SIC_PAYLOAD_LEN 8 +#define PHI_IME_PAYLOAD_LEN 6 +#define PHI_VME_WRITEBACK_LEN 7 + + if(msg_type == 1 || msg_type == 2 || msg_type == 3) + virt_rsp_len = PHI_VME_WRITEBACK_LEN; + if(msg_type == 1 || msg_type == 3) + phi_pld_len = PHI_SIC_PAYLOAD_LEN; + else if(msg_type == 2) + phi_pld_len = PHI_IME_PAYLOAD_LEN; + if(execWidth_org == 8) + virt_pld_len = phi_pld_len; + else if(execWidth_org == 16) + virt_pld_len = (phi_pld_len + 1) / 2; + p->push(); + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 
1; + p->curr.execWidth = 1; + /* Now cl_intel_device_side_avc_motion_estimation is impelemented based on simd16 mode. + * So fall back to simd8 is not acceptable now. + * */ + GBE_ASSERT(execWidth_org == 16); + /* Use MOV to Setup bits of payload: mov payload value stored in insn.src(x) to + * consecutive payload grf. + * In simd8 mode, one virtual grf register map to one physical grf register. But + * in simd16 mode, one virtual grf register map to two physical grf registers. + * So we should treat them differently. + * */ + if(execWidth_org == 8){ + for(int i=0; i < virt_pld_len; i++){ + GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len+i)); + payload_grf.vstride = GEN_VERTICAL_STRIDE_0; + payload_grf.width = GEN_WIDTH_1; + payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0; + payload_grf.subphysical = 1; + for(int j=0; j < 8; j++){ + payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD); + GenRegister payload_val = ra->genReg(insn.src(i*8+j)); + payload_val.vstride = GEN_VERTICAL_STRIDE_0; + payload_val.width = GEN_WIDTH_1; + payload_val.hstride = GEN_HORIZONTAL_STRIDE_0; + + p->MOV(payload_grf, payload_val); + } + } + } + else if(execWidth_org == 16){ + for(int i=0; i < virt_pld_len; i++){ + int nr_num = 2; + if( (i == virt_pld_len-1) && (phi_pld_len%2 == 1) ) + nr_num = 1; + for(int k = 0; k < nr_num; k++){ + GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len+i)); + payload_grf.nr += k; + payload_grf.vstride = GEN_VERTICAL_STRIDE_0; + payload_grf.width = GEN_WIDTH_1; + payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0; + payload_grf.subphysical = 1; + for(int j=0; j < 8; j++){ + payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD); + GenRegister payload_val = ra->genReg(insn.src(i*16+k*8+j)); + payload_val.vstride = GEN_VERTICAL_STRIDE_0; + payload_val.width = GEN_WIDTH_1; + payload_val.hstride = GEN_HORIZONTAL_STRIDE_0; + + p->MOV(payload_grf, payload_val); + } + } + } + } + p->pop(); + +#undef PHI_SIC_PAYLOAD_LEN +#undef PHI_IME_PAYLOAD_LEN +#undef 
PHI_VME_WRITEBACK_LEN + + p->push(); + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.execWidth = 1; + GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(virt_rsp_len)), GEN_TYPE_UB); + payload_did.vstride = GEN_VERTICAL_STRIDE_0; + payload_did.width = GEN_WIDTH_1; + payload_did.hstride = GEN_HORIZONTAL_STRIDE_0; + payload_did.subphysical = 1; + payload_did.subnr = 20 * typeSize(GEN_TYPE_UB); + GenRegister grf0 = GenRegister::ub1grf(0, 20); + p->MOV(payload_did, grf0); + p->pop(); + + const GenRegister msgPayload = ra->genReg(insn.dst(virt_rsp_len)); + const unsigned char bti = insn.getbti(); + p->IME(bti, dst, msgPayload, msg_type); + } + void BxtContext::newSelection(void) { this->sel = GBE_NEW(SelectionBxt, *this); } diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp index 04766616..95a8ec32 100644 --- a/backend/src/backend/gen9_context.hpp +++ b/backend/src/backend/gen9_context.hpp @@ -37,6 +37,7 @@ namespace gbe : Gen8Context(unit, name, deviceID, relaxMath) { }; virtual void emitBarrierInstruction(const SelectionInstruction &insn); + virtual void emitImeInstruction(const SelectionInstruction &insn); protected: virtual GenEncoder* generateEncoder(void) { diff --git a/backend/src/backend/gen9_encoder.cpp b/backend/src/backend/gen9_encoder.cpp index b37fd981..cf6009a9 100644 --- a/backend/src/backend/gen9_encoder.cpp +++ b/backend/src/backend/gen9_encoder.cpp @@ -75,6 +75,52 @@ namespace gbe simd_mode, return_format); } + void Gen9Encoder::setImeMessage(GenNativeInstruction *insn, + unsigned char bti, + uint32_t response_length, + uint32_t msg_length, + uint32_t msg_type) + { + + GenMessageTarget sfid = GEN_SFID_NULL; + if(msg_type == 1 || msg_type == 3) + // 0Dh Check and Refinement Engine SFID_CRE SKL+ (SIC and FBR blong to SFID_CRE on SKL+) + sfid = GEN_SFID_CHECK_REFINE; + else if(msg_type == 2) + sfid = GEN_SFID_VIDEO_MOTION_EST; + setMessageDescriptor(insn, sfid, msg_length, 
response_length, true); + Gen8NativeInstruction *gen8_insn = &insn->gen8_insn; + gen8_insn->bits3.ime_gen8.bti = bti; + gen8_insn->bits3.ime_gen8.msg_type = msg_type; + gen8_insn->bits3.ime_gen8.stream_out_enable = 0; + gen8_insn->bits3.ime_gen8.stream_in_enable = 0; + gen8_insn->bits3.ime_gen8.stream_out_enable2 = 0; + + } + + void Gen9Encoder::IME(unsigned char bti, + GenRegister dest, + GenRegister msg, + uint32_t msg_type) + { + GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3); + uint32_t msg_length, response_length; + if(msg_type == 1 || msg_type == 3){ + msg_length = 8; + response_length = 7; + } + if(msg_type == 2){ + msg_length = 6; + response_length = 7; + } + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + this->setHeader(insn); + this->setDst(insn, dest); + this->setSrc0(insn, msg); + this->setSrc1(insn, GenRegister::immud(0)); + setImeMessage(insn, bti, response_length, msg_length, msg_type); + } + void Gen9Encoder::setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, GenRegister src0, GenRegister src1) { assert(dst.subnr == 0 && src0.subnr == 0 && src1.subnr == 0); diff --git a/backend/src/backend/gen9_encoder.hpp b/backend/src/backend/gen9_encoder.hpp index 2eaa5381..b8626495 100644 --- a/backend/src/backend/gen9_encoder.hpp +++ b/backend/src/backend/gen9_encoder.hpp @@ -47,6 +47,15 @@ namespace gbe uint32_t return_format, bool isLD, bool isUniform); + virtual void IME(unsigned char bti, + GenRegister dest, + GenRegister msg, + uint32_t msg_type); + void setImeMessage(GenNativeInstruction *insn, + unsigned char bti, + uint32_t response_length, + uint32_t msg_length, + uint32_t msg_type); void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, GenRegister src0, GenRegister src1); virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends); virtual void TYPED_WRITE(GenRegister header, GenRegister data, bool header_present, unsigned char bti, bool 
useSends); diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 79a3e625..0b171ff9 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -2339,10 +2339,20 @@ namespace gbe const unsigned int msg_type = insn.extra.msg_type; GBE_ASSERT(msg_type == 1); - int rsp_len; - if(msg_type == 1) - rsp_len = 6; uint32_t execWidth_org = p->curr.execWidth; + int virt_pld_len; + int virt_rsp_len; + +#define PHI_VME_PAYLOAD_LEN 5 +#define PHI_VME_WRITEBACK_LEN 6 + + if(msg_type == 1){ + virt_rsp_len = PHI_VME_WRITEBACK_LEN; + if(execWidth_org == 8) + virt_pld_len = PHI_VME_PAYLOAD_LEN; + else if(execWidth_org == 16) + virt_pld_len = (PHI_VME_PAYLOAD_LEN + 1) / 2; + } p->push(); p->curr.predicate = GEN_PREDICATE_NONE; p->curr.noMask = 1; @@ -2354,8 +2364,8 @@ namespace gbe * So we should treat them differently. * */ if(execWidth_org == 8){ - for(int i=0; i < 5; i++){ - GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i)); + for(int i=0; i < virt_pld_len; i++){ + GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len+i)); payload_grf.vstride = GEN_VERTICAL_STRIDE_0; payload_grf.width = GEN_WIDTH_1; payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0; @@ -2372,9 +2382,12 @@ namespace gbe } } else if(execWidth_org == 16){ - for(int i=0; i < 2; i++){ - for(int k = 0; k < 2; k++){ - GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i)); + for(int i=0; i < virt_pld_len; i++){ + int nr_num = 2; + if( (i == virt_pld_len-1) && (PHI_VME_PAYLOAD_LEN%2 == 1) ) + nr_num = 1; + for(int k = 0; k < nr_num; k++){ + GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len+i)); payload_grf.nr += k; payload_grf.vstride = GEN_VERTICAL_STRIDE_0; payload_grf.width = GEN_WIDTH_1; @@ -2391,31 +2404,16 @@ namespace gbe } } } - { - int i = 2; - GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i)); - payload_grf.vstride = GEN_VERTICAL_STRIDE_0; - payload_grf.width = GEN_WIDTH_1; - payload_grf.hstride = 
GEN_HORIZONTAL_STRIDE_0; - payload_grf.subphysical = 1; - for(int j=0; j < 8; j++){ - payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD); - GenRegister payload_val = ra->genReg(insn.src(i*16+j)); - payload_val.vstride = GEN_VERTICAL_STRIDE_0; - payload_val.width = GEN_WIDTH_1; - payload_val.hstride = GEN_HORIZONTAL_STRIDE_0; - - p->MOV(payload_grf, payload_val); - } - } } p->pop(); +#undef PHI_VME_PAYLOAD_LEN +#undef PHI_VME_WRITEBACK_LEN p->push(); p->curr.predicate = GEN_PREDICATE_NONE; p->curr.noMask = 1; p->curr.execWidth = 1; - GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(rsp_len)), GEN_TYPE_UB); + GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(virt_rsp_len)), GEN_TYPE_UB); payload_did.vstride = GEN_VERTICAL_STRIDE_0; payload_did.width = GEN_WIDTH_1; payload_did.hstride = GEN_HORIZONTAL_STRIDE_0; @@ -2425,13 +2423,17 @@ namespace gbe p->MOV(payload_did, grf0); p->pop(); - const GenRegister msgPayload = ra->genReg(insn.dst(rsp_len)); + const GenRegister msgPayload = ra->genReg(insn.dst(virt_rsp_len)); const unsigned char bti = insn.getbti(); const unsigned int vme_search_path_lut = insn.extra.vme_search_path_lut; const unsigned int lut_sub = insn.extra.lut_sub; p->VME(bti, dst, msgPayload, msg_type, vme_search_path_lut, lut_sub); } + void GenContext::emitImeInstruction(const SelectionInstruction &insn) { + GBE_ASSERT(0); + } + void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) { p->push(); uint32_t simdWidth = p->curr.execWidth; diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 7fd40d1b..fa24bfe5 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -178,6 +178,7 @@ namespace gbe void emitDWordGatherInstruction(const SelectionInstruction &insn); void emitSampleInstruction(const SelectionInstruction &insn); void emitVmeInstruction(const SelectionInstruction 
&insn); + virtual void emitImeInstruction(const SelectionInstruction &insn); void emitTypedWriteInstruction(const SelectionInstruction &insn); void emitSpillRegInstruction(const SelectionInstruction &insn); void emitUnSpillRegInstruction(const SelectionInstruction &insn); diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp index c34e1bb6..90de946f 100644 --- a/backend/src/backend/gen_defs.hpp +++ b/backend/src/backend/gen_defs.hpp @@ -219,6 +219,7 @@ enum GenMessageTarget { GEN_SFID_DATAPORT_DATA = 10, GEN_SFID_PIXEL_INTERPOLATOR = 11, GEN_SFID_DATAPORT1_DATA = 12, /* New for HSW and BDW. */ + GEN_SFID_CHECK_REFINE = 13, /* New for SLK+*/ }; #define GEN_PREDICATE_NONE 0 diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index 217a2d85..abd0d062 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -1276,6 +1276,14 @@ namespace gbe msg_type, vme_search_path_lut, lut_sub); } + void GenEncoder::IME(unsigned char bti, + GenRegister dest, + GenRegister msg, + uint32_t msg_type) + { + GBE_ASSERT(0); + } + void GenEncoder::TYPED_WRITE(GenRegister msg, GenRegister data, bool header_present, unsigned char bti, bool useSends) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index 040b94a2..fae8da1b 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -231,6 +231,10 @@ namespace gbe uint32_t msg_type, unsigned char vme_search_path_lut, unsigned char lut_sub); + virtual void IME(unsigned char bti, + GenRegister dest, + GenRegister msg, + uint32_t msg_type); virtual void FLUSH_SAMPLERCACHE(GenRegister dst); /*! 
TypedWrite instruction for texture */ diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx index c75557ca..d15547db 100644 --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx @@ -43,6 +43,7 @@ DECL_GEN7_SCHEDULE(PackLong, 40, 1, 1) DECL_GEN7_SCHEDULE(UnpackLong, 40, 1, 1) DECL_GEN7_SCHEDULE(Sample, 160, 1, 1) DECL_GEN7_SCHEDULE(Vme, 320, 1, 1) +DECL_GEN7_SCHEDULE(Ime, 320, 1, 1) DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1) DECL_GEN7_SCHEDULE(SpillReg, 20, 1, 1) DECL_GEN7_SCHEDULE(UnSpillReg, 160, 1, 1) diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index c89a83e7..ea1cd5c0 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -193,6 +193,7 @@ namespace gbe this->opcode == SEL_OP_BYTE_GATHERA64 || this->opcode == SEL_OP_SAMPLE || this->opcode == SEL_OP_VME || + this->opcode == SEL_OP_IME || this->opcode == SEL_OP_DWORD_GATHER || this->opcode == SEL_OP_OBREAD || this->opcode == SEL_OP_MBREAD; @@ -740,6 +741,7 @@ namespace gbe void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform); /*! Encode vme instructions */ void VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type, uint32_t vme_search_path_lut, uint32_t lut_sub); + void IME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type); /*! Encode typed write instructions */ void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D); /*! 
Get image information */ @@ -2733,6 +2735,25 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp insn->extra.lut_sub = lut_sub; } + void Selection::Opaque::IME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, + uint32_t dstNum, uint32_t srcNum, uint32_t msg_type) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_IME, dstNum, srcNum); + SelectionVector *dstVector = this->appendVector(); + + for (uint32_t elemID = 0; elemID < dstNum; ++elemID) + insn->dst(elemID) = dst[elemID]; + for (uint32_t elemID = 0; elemID < srcNum; ++elemID) + insn->src(elemID) = payloadVal[elemID]; + + dstVector->regNum = dstNum; + dstVector->isSrc = 0; + dstVector->offsetID = 0; + dstVector->reg = &insn->dst(0); + + insn->setbti(bti); + insn->extra.ime_msg_type = msg_type; + } + /////////////////////////////////////////////////////////////////////////// // Code selection public implementation /////////////////////////////////////////////////////////////////////////// @@ -7045,6 +7066,47 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp DECL_CTOR(VmeInstruction, 1, 1); }; + DECL_PATTERN(ImeInstruction) + { + INLINE bool emitOne(Selection::Opaque &sel, const ir::ImeInstruction &insn, bool &markChildren) const + { + using namespace ir; + uint32_t msg_type; + msg_type = insn.getMsgType(); + GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3); + uint32_t payloadLen = 0; + if(msg_type == 2){ + payloadLen = 6; + } + else if(msg_type == 1 || msg_type == 3){ + payloadLen = 8; + } + uint32_t selDstNum = insn.getDstNum() + payloadLen; + uint32_t srcNum = insn.getSrcNum(); + vector dst(selDstNum); + vector payloadVal(srcNum); + uint32_t valueID = 0; + for (valueID = 0; valueID < insn.getDstNum(); ++valueID) + dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType()); + for (valueID = insn.getDstNum(); valueID < selDstNum; ++valueID) + dst[valueID] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + + for (valueID = 0; 
valueID < srcNum; ++valueID) + payloadVal[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType()); + + uint32_t bti = insn.getImageIndex() + BTI_WORKAROUND_IMAGE_OFFSET; + if (bti > BTI_MAX_ID) { + std::cerr << "Too large bti " << bti; + return false; + } + + sel.IME(bti, dst.data(), payloadVal.data(), selDstNum, srcNum, msg_type); + + return true; + } + DECL_CTOR(ImeInstruction, 1, 1); + }; + /*! Typed write instruction pattern. */ DECL_PATTERN(TypedWriteInstruction) { @@ -8201,6 +8263,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp this->insert(); this->insert(); this->insert(); + this->insert(); this->insert(); this->insert(); this->insert(); diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp index aa433883..664a9fad 100644 --- a/backend/src/backend/gen_insn_selection.hpp +++ b/backend/src/backend/gen_insn_selection.hpp @@ -96,8 +96,8 @@ namespace gbe const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; } /*! Set debug infomation to selection */ void setDBGInfo(DebugInfo in) { DBGInfo = in; } - /*! No more than 40 sources (40 sources are used by vme for payload passing and setting) */ - enum { MAX_SRC_NUM = 40 }; + /*! No more than 64 sources (48 sources are used by vme for payload passing and setting) */ + enum { MAX_SRC_NUM = 64 }; /*! No more than 17 destinations (17 used by image block read8) */ enum { MAX_DST_NUM = 17 }; /*! State of the instruction (extra fields neeed for the encoding) */ @@ -143,6 +143,10 @@ namespace gbe uint16_t vme_search_path_lut:3; uint16_t lut_sub:2; }; + struct { + uint16_t ime_bti:8; + uint16_t ime_msg_type:2; + }; uint32_t barrierType; uint32_t waitType; bool longjmp; @@ -172,7 +176,7 @@ namespace gbe /*! Number of destinations */ uint8_t dstNum:5; /*! Number of sources */ - uint8_t srcNum:6; + uint8_t srcNum:7; /*! To store various indices */ uint32_t index; /*! 
For BRC/IF to store the UIP */ @@ -192,6 +196,7 @@ namespace gbe case SEL_OP_DWORD_GATHER: return extra.function; case SEL_OP_SAMPLE: return extra.rdbti; case SEL_OP_VME: return extra.vme_bti; + case SEL_OP_IME: return extra.ime_bti; case SEL_OP_TYPED_WRITE: return extra.bti; default: GBE_ASSERT(0); @@ -209,6 +214,7 @@ namespace gbe case SEL_OP_DWORD_GATHER: extra.function = bti; return; case SEL_OP_SAMPLE: extra.rdbti = bti; return; case SEL_OP_VME: extra.vme_bti = bti; return; + case SEL_OP_IME: extra.ime_bti = bti; return; case SEL_OP_TYPED_WRITE: extra.bti = bti; return; default: GBE_ASSERT(0); diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx index 5d96e9e4..24dd040a 100644 --- a/backend/src/backend/gen_insn_selection.hxx +++ b/backend/src/backend/gen_insn_selection.hxx @@ -72,6 +72,7 @@ DECL_SELECTION_IR(PACK_LONG, PackLongInstruction) DECL_SELECTION_IR(UNPACK_LONG, UnpackLongInstruction) DECL_SELECTION_IR(SAMPLE, SampleInstruction) DECL_SELECTION_IR(VME, VmeInstruction) +DECL_SELECTION_IR(IME, ImeInstruction) DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction) DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction) DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction) diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index 48590fd1..75e1eec4 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -682,6 +682,50 @@ namespace ir { uint32_t dstNum; }; + class ALIGNED_INSTRUCTION ImeInstruction : + public BasePolicy, + public TupleSrcPolicy, + public TupleDstPolicy + { + public: + ImeInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple, + uint32_t dstNum, uint32_t srcNum, int msg_type) { + this->opcode = OP_IME; + this->dst = dstTuple; + this->src = srcTuple; + this->dstNum = dstNum; + this->srcNum = srcNum; + this->imageIdx = imageIdx; + this->msg_type = msg_type; + } + INLINE bool wellFormed(const Function &fn, std::string &why) const; + INLINE void 
out(std::ostream &out, const Function &fn) const { + this->outOpcode(out); + out << " src_surface id " << (int)this->getImageIndex() + << " ref_surface id " << (int)this->getImageIndex() + 1; + for(uint32_t i = 0; i < dstNum; i++){ + out<< " %" << this->getDst(fn, i); + } + for(uint32_t i = 0; i < srcNum; i++){ + out<< " %" << this->getSrc(fn, i); + } + out + << " msg_type " << (int)this->getMsgType(); + } + Tuple src; + Tuple dst; + + INLINE uint8_t getImageIndex(void) const { return this->imageIdx; } + INLINE uint8_t getMsgType(void) const { return this->msg_type; } + + INLINE Type getSrcType(void) const { return TYPE_U32; } + INLINE Type getDstType(void) const { return TYPE_U32; } + uint8_t imageIdx; + uint8_t msg_type; + uint32_t srcNum; + uint32_t dstNum; + }; + class ALIGNED_INSTRUCTION TypedWriteInstruction : // TODO public BasePolicy, @@ -1454,6 +1498,8 @@ namespace ir { { return true; } INLINE bool VmeInstruction::wellFormed(const Function &fn, std::string &why) const { return true; } + INLINE bool ImeInstruction::wellFormed(const Function &fn, std::string &why) const + { return true; } INLINE bool TypedWriteInstruction::wellFormed(const Function &fn, std::string &why) const { return true; } INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const @@ -2182,6 +2228,9 @@ END_INTROSPECTION(WaitInstruction) START_INTROSPECTION(VmeInstruction) #include "ir/instruction.hxx" END_INTROSPECTION(VmeInstruction) +START_INTROSPECTION(ImeInstruction) +#include "ir/instruction.hxx" +END_INTROSPECTION(ImeInstruction) START_INTROSPECTION(WorkGroupInstruction) #include "ir/instruction.hxx" @@ -2404,6 +2453,10 @@ DECL_MEM_FN(VmeInstruction, Type, getSrcType(void), getSrcType()) DECL_MEM_FN(VmeInstruction, Type, getDstType(void), getDstType()) DECL_MEM_FN(VmeInstruction, uint8_t, getImageIndex(void), getImageIndex()) DECL_MEM_FN(VmeInstruction, uint8_t, getMsgType(void), getMsgType()) +DECL_MEM_FN(ImeInstruction, Type, getSrcType(void), 
getSrcType()) +DECL_MEM_FN(ImeInstruction, Type, getDstType(void), getDstType()) +DECL_MEM_FN(ImeInstruction, uint8_t, getImageIndex(void), getImageIndex()) +DECL_MEM_FN(ImeInstruction, uint8_t, getMsgType(void), getMsgType()) DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType()) DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType()) DECL_MEM_FN(TypedWriteInstruction, uint8_t, getImageIndex(void), getImageIndex()) @@ -2709,6 +2762,9 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg)) Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub) { return internal::VmeInstruction(imageIndex, dst, src, dstNum, srcNum, msg_type, vme_search_path_lut, lut_sub).convert(); } + Instruction IME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type) { + return internal::ImeInstruction(imageIndex, dst, src, dstNum, srcNum, msg_type).convert(); + } Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType) { return internal::TypedWriteInstruction(imageIndex, src, srcNum, srcType, coordType).convert(); diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index 05c3e649..ab8bc1f9 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -217,7 +217,8 @@ namespace ir { return T::isClassOf(*this); } /*! 
max_src used by vme for payload passing and setting */ - static const uint32_t MAX_SRC_NUM = 40; + //static const uint32_t MAX_SRC_NUM = 48; + static const uint32_t MAX_SRC_NUM = 64; static const uint32_t MAX_DST_NUM = 32; DebugInfo DBGInfo; protected: @@ -429,6 +430,16 @@ namespace ir { static bool isClassOf(const Instruction &insn); }; + class ImeInstruction : public Instruction { + public: + uint8_t getImageIndex() const; + uint8_t getMsgType() const; + Type getSrcType(void) const; + Type getDstType(void) const; + /*! Return true if the given instruction is an instance of this class */ + static bool isClassOf(const Instruction &insn); + }; + typedef union _ImageInfoKey{ _ImageInfoKey(uint8_t i, uint8_t t) : index(i), type(t) {}; _ImageInfoKey(int key) : data(key) {}; @@ -880,6 +891,7 @@ namespace ir { Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, uint8_t srcNum, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset); /*! video motion estimation */ Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub); + Instruction IME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type); /*! get image information , such as width/height/depth/... */ Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg); /*! 
label labelIndex */ diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx index 81618eb9..2054b9c2 100644 --- a/backend/src/ir/instruction.hxx +++ b/backend/src/ir/instruction.hxx @@ -86,6 +86,7 @@ DECL_INSN(LABEL, LabelInstruction) DECL_INSN(READ_ARF, ReadARFInstruction) DECL_INSN(REGION, RegionInstruction) DECL_INSN(VME, VmeInstruction) +DECL_INSN(IME, ImeInstruction) DECL_INSN(INDIRECT_MOV, IndirectMovInstruction) DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction) DECL_INSN(MUL_HI, BinaryInstruction) diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp index dbb5c33f..1d385ee3 100644 --- a/backend/src/ir/liveness.cpp +++ b/backend/src/ir/liveness.cpp @@ -142,6 +142,7 @@ namespace ir { opCode != ir::OP_RHADD && opCode != ir::OP_READ_ARF && opCode != ir::OP_ADDSAT && + opCode != ir::OP_IME && (dstNum == 1 || insn.getOpcode() != ir::OP_LOAD) && !extentRegs->contains(reg) ) diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h index d5fa589a..cb9e5bdd 100644 --- a/backend/src/libocl/include/ocl_misc.h +++ b/backend/src/libocl/include/ocl_misc.h @@ -19,6 +19,10 @@ #define __OCL_MISC_H__ #include "ocl_types.h" +#include "ocl_workitem.h" +#include "ocl_simd.h" +#include "ocl_printf.h" +#include "ocl_as.h" #define DEC2(TYPE, XTYPE, MASKTYPE) \ OVERLOADABLE TYPE##2 shuffle(XTYPE x, MASKTYPE##2 mask); @@ -138,6 +142,232 @@ struct time_stamp { uint event; }; +//Interlaced image field polarity values: +#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 +#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 + +//Inter macro-block major shape values: +#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0 +#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1 +#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2 +#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3 + +//Inter macro-block minor shape values: +#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0 +#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1 +#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2 +#define 
CLK_AVC_ME_MINOR_4x4_INTEL 0x3 + +//Inter macro-block major direction values: +#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0 +#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 +#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 + +//Inter (IME) partition mask values: +#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 +#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E +#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D +#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B +#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 +#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F +#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F +#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F + +//Slice type values: +#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 +#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 +#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 + +//Search window configuration: +#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 +#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 +#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 +#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 +#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 +#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 +#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 +#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 + +//SAD adjustment mode: +#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 + +//Pixel resolution: +#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 + +//Cost precision values: +#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 + +//Inter bidirectional weights: +#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 +#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 +#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 
+#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B +#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 + +//Inter border reached values: +#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 +#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 +#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 +#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 + +//Intra macro-block shape values: +#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0 +#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1 +#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2 + +//Inter skip block partition type: +#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 +#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x04000 + +//Inter skip motion vector mask: +#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1<<24) +#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2<<24) +#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3<<24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55<<24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA<<24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF<<24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1<<24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2<<24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1<<26) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2<<26) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1<<28) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2<<28) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1<<30) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2<<30) + +//Block based skip type values: +#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x0 +#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 + +//Luma intra partition mask values: +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 +#define
CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 + +//Intra neighbor availability mask values: +#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 + +//Luma intra modes: +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 + +//Chroma intra modes: +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +//Reference image select values: +#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1 +#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2 +#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3 + +//VME media sampler initialization value: +#define CLK_AVC_ME_INITIALIZE_INTEL 0x0 + +//Default IME payload initialization: +#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL {0x0} + +//Default REF payload initialization: +#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL {0x0} + +//Default SIC payload initialization: +#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL {0x0} + +//Default IME result initialization: +#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL {0x0} + +//Default REF result initialization: 
+#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL {0x0} + +//Default SIC result initialization: +#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL {0x0} + +typedef struct{ + ushort2 srcCoord; + short2 ref_offset; + uchar partition_mask; + uchar sad_adjustment; + uchar search_window_config; + ulong cc0; + ulong cc1; + ulong cc2; + ulong cc3; + uint2 packed_cost_table; + uchar cost_precision; + ulong packed_shape_cost; +}intel_sub_group_avc_ime_payload_t; + +typedef uint8 intel_sub_group_avc_ime_result_t; + +#define REF_ENABLE_COST_PENALTY 1 + +typedef struct{ + ushort2 srcCoord; + long mv; + uchar major_shape; + uchar minor_shapes; + uchar directions; + uchar pixel_mode; + uchar sad_adjustment; +#if REF_ENABLE_COST_PENALTY + ulong cc0; + ulong cc1; + ulong cc2; + ulong cc3; + uint2 packed_cost_table; + uchar cost_precision; + ulong packed_shape_cost; +#endif +}intel_sub_group_avc_ref_payload_t; + +typedef struct{ + ushort2 srcCoord; + uint skip_block_partition_type; + uint skip_motion_vector_mask; + char bidirectional_weight; + uchar skip_sad_adjustment; + long mv; + + uchar luma_intra_partition_mask; + uchar intra_neighbour_availabilty; + uint l_0_3; + uint l_4_7; + uint l_8_11; + uint l_12_15; + uint u_0_3; + uint u_4_7; + uint u_8_11; + uint u_12_15; + uint ur_16_19; + uint ur_20_23; + uchar upper_left_corner_luma_pixel; + uchar intra_sad_adjustment; + uint intra_shape_cost; +}intel_sub_group_avc_sic_payload_t; + +typedef uint8 intel_sub_group_avc_ref_result_t; + +typedef uint8 intel_sub_group_avc_sic_result_t; + uint __gen_ocl_region(ushort offset, uint data); struct time_stamp __gen_ocl_get_timestamp(void); @@ -155,6 +385,140 @@ uint8 __gen_ocl_vme(image2d_t, image2d_t, uint, uint, uint, uint, int, int, int); +intel_sub_group_avc_ime_result_t +__gen_ocl_ime(image2d_t, image2d_t, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, 
uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + uint, uint, uint, uint, + int); + +intel_sub_group_avc_ime_payload_t +intel_sub_group_avc_ime_initialize(ushort2 src_coord, + uchar partition_mask, + uchar sad_adjustment); + +intel_sub_group_avc_ime_payload_t +intel_sub_group_avc_ime_set_single_reference(short2 ref_offset, + uchar search_window_config, + intel_sub_group_avc_ime_payload_t payload); + +intel_sub_group_avc_ime_result_t +intel_sub_group_avc_ime_evaluate_with_single_reference(read_only image2d_t src_image, + read_only image2d_t ref_image, + sampler_t vme_media_sampler, + intel_sub_group_avc_ime_payload_t payload); + +ulong intel_sub_group_avc_ime_get_motion_vectors(intel_sub_group_avc_ime_result_t result); + +ushort intel_sub_group_avc_ime_get_inter_distortions(intel_sub_group_avc_ime_result_t result); + +ushort intel_sub_group_avc_ime_get_inter_distortions(intel_sub_group_avc_ime_result_t result); + +uchar intel_sub_group_avc_ime_get_inter_major_shape(intel_sub_group_avc_ime_result_t result); + +uchar intel_sub_group_avc_ime_get_inter_minor_shapes(intel_sub_group_avc_ime_result_t result); + +uchar intel_sub_group_avc_ime_get_inter_directions(intel_sub_group_avc_ime_result_t result); + +intel_sub_group_avc_ref_payload_t +intel_sub_group_avc_fme_initialize(ushort2 src_coord, + ulong motion_vectors, + uchar major_shapes, + uchar minor_shapes, + uchar directions, + uchar pixel_resolution, + uchar sad_adjustment ); + +intel_sub_group_avc_ref_result_t +intel_sub_group_avc_ref_evaluate_with_single_reference(read_only image2d_t src_image, + read_only image2d_t ref_image, + sampler_t vme_media_sampler, + intel_sub_group_avc_ref_payload_t payload); + +ulong intel_sub_group_avc_ref_get_motion_vectors(intel_sub_group_avc_ref_result_t result); + +ushort 
intel_sub_group_avc_ref_get_inter_distortions(intel_sub_group_avc_ref_result_t result); + +uint2 intel_sub_group_avc_mce_get_default_medium_penalty_cost_table(void); + +intel_sub_group_avc_ime_payload_t +intel_sub_group_avc_ime_set_motion_vector_cost_function(ulong packed_cost_center_delta, + uint2 packed_cost_table, + uchar cost_precision, + intel_sub_group_avc_ime_payload_t payload); + +#if REF_ENABLE_COST_PENALTY +intel_sub_group_avc_ref_payload_t +intel_sub_group_avc_ref_set_motion_vector_cost_function(ulong packed_cost_center_delta, + uint2 packed_cost_table, + uchar cost_precision, + intel_sub_group_avc_ref_payload_t payload); +#endif + +intel_sub_group_avc_ime_payload_t +intel_sub_group_avc_ime_set_inter_shape_penalty(ulong packed_shape_cost, + intel_sub_group_avc_ime_payload_t payload); + +intel_sub_group_avc_sic_result_t +intel_sub_group_avc_sic_evaluate_ipe(read_only image2d_t src_image, + sampler_t vme_media_sampler, + intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_sic_payload_t +intel_sub_group_avc_sic_initialize(ushort2 src_coord ); + +intel_sub_group_avc_sic_payload_t +intel_sub_group_avc_sic_configure_ipe(uchar luma_intra_partition_mask, + uchar intra_neighbour_availabilty, + uchar left_edge_luma_pixels, + uchar upper_left_corner_luma_pixel, + uchar upper_edge_luma_pixels, + uchar upper_right_edge_luma_pixels, + uchar intra_sad_adjustment, + intel_sub_group_avc_sic_payload_t payload ); +intel_sub_group_avc_sic_payload_t +intel_sub_group_avc_sic_set_intra_luma_shape_penalty(uint packed_shape_cost, + intel_sub_group_avc_sic_payload_t payload ); + +uchar +intel_sub_group_avc_sic_get_ipe_luma_shape(intel_sub_group_avc_sic_result_t result); + +ushort +intel_sub_group_avc_sic_get_best_ipe_luma_distortion(intel_sub_group_avc_sic_result_t result); + +ulong intel_sub_group_avc_sic_get_packed_ipe_luma_modes(intel_sub_group_avc_sic_result_t result); + + +intel_sub_group_avc_sic_result_t 
+intel_sub_group_avc_sic_evaluate_with_single_reference(read_only image2d_t src_image, + read_only image2d_t ref_image, + sampler_t vme_media_sampler, + intel_sub_group_avc_sic_payload_t payload); + +intel_sub_group_avc_sic_payload_t +intel_sub_group_avc_sic_configure_skc(uint skip_block_partition_type, + uint skip_motion_vector_mask, + ulong motion_vectors, + char bidirectional_weight, + uchar skip_sad_adjustment, + intel_sub_group_avc_sic_payload_t payload); + +ushort +intel_sub_group_avc_sic_get_inter_distortions(intel_sub_group_avc_sic_result_t result); + bool __gen_ocl_in_local(size_t p); bool __gen_ocl_in_private(size_t p); diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl index bfa2fa71..ce139a6c 100644 --- a/backend/src/libocl/src/ocl_misc.cl +++ b/backend/src/libocl/src/ocl_misc.cl @@ -232,6 +232,1331 @@ struct time_stamp __gen_ocl_get_timestamp(void) { return val; }; +intel_sub_group_avc_ime_payload_t +intel_sub_group_avc_ime_initialize(ushort2 src_coord, + uchar partition_mask, + uchar sad_adjustment){ + intel_sub_group_avc_ime_payload_t pl; + pl.srcCoord = src_coord; + pl.partition_mask = partition_mask; + pl.sad_adjustment = sad_adjustment; + pl.ref_offset = (short2)(0, 0); + pl.search_window_config = 0; + pl.cc0 = 0; + pl.cc1 = 0; + pl.cc2 = 0; + pl.cc3 = 0; + pl.packed_cost_table = (uint2)(0, 0); + pl.cost_precision = 2; + pl.packed_shape_cost = 0; + return pl; +} + +intel_sub_group_avc_ime_payload_t +intel_sub_group_avc_ime_set_single_reference(short2 ref_offset, + uchar search_window_config, + intel_sub_group_avc_ime_payload_t payload){ + intel_sub_group_avc_ime_payload_t pl = payload; + pl.ref_offset = ref_offset; + pl.search_window_config = search_window_config; + return pl; +} + +intel_sub_group_avc_ime_result_t +intel_sub_group_avc_ime_evaluate_with_single_reference(read_only image2d_t src_image, + read_only image2d_t ref_image, + sampler_t vme_media_sampler, + intel_sub_group_avc_ime_payload_t payload){ + 
uint src_grf0_dw7; + uint src_grf0_dw6; + uint src_grf0_dw5; + uint src_grf0_dw4; + uint src_grf0_dw3; + uint src_grf0_dw2; + uint src_grf0_dw1; + uint src_grf0_dw0; + uint src_grf1_dw7; + uint src_grf1_dw6; + uint src_grf1_dw5; + uint src_grf1_dw4; + uint src_grf1_dw3; + uint src_grf1_dw2; + uint src_grf1_dw1; + uint src_grf1_dw0; + uint src_grf2_dw7; + uint src_grf2_dw6; + uint src_grf2_dw5; + uint src_grf2_dw4; + uint src_grf2_dw3; + uint src_grf2_dw2; + uint src_grf2_dw1; + uint src_grf2_dw0; + uint src_grf3_dw7; + uint src_grf3_dw6; + uint src_grf3_dw5; + uint src_grf3_dw4; + uint src_grf3_dw3; + uint src_grf3_dw2; + uint src_grf3_dw1; + uint src_grf3_dw0; + uint src_grf4_dw7; + uint src_grf4_dw6; + uint src_grf4_dw5; + uint src_grf4_dw4; + uint src_grf4_dw3; + uint src_grf4_dw2; + uint src_grf4_dw1; + uint src_grf4_dw0; + uint src_grf5_dw7; + uint src_grf5_dw6; + uint src_grf5_dw5; + uint src_grf5_dw4; + uint src_grf5_dw3; + uint src_grf5_dw2; + uint src_grf5_dw1; + uint src_grf5_dw0; + uint src_grf6_dw7 = 0; //grf6/grf7 are never assigned in this function but are still passed to __gen_ocl_ime + uint src_grf6_dw6 = 0; + uint src_grf6_dw5 = 0; + uint src_grf6_dw4 = 0; + uint src_grf6_dw3 = 0; + uint src_grf6_dw2 = 0; + uint src_grf6_dw1 = 0; + uint src_grf6_dw0 = 0; + uint src_grf7_dw7 = 0; + uint src_grf7_dw6 = 0; + uint src_grf7_dw5 = 0; + uint src_grf7_dw4 = 0; + uint src_grf7_dw3 = 0; + uint src_grf7_dw2 = 0; + uint src_grf7_dw1 = 0; + uint src_grf7_dw0 = 0; + + + //src_grf0_dw7 = Debug; + src_grf0_dw7 = 0; + //src_grf0_dw6 = Debug; + src_grf0_dw6 = 0; + //src_grf0_dw4 = Ignored; + src_grf0_dw4 = 0; + + short2 predict_mv = payload.ref_offset; + //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL + //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id); + src_grf0_dw5 = (20 << 24) | (20 << 16) | (0 << 8) | (0); + //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X); + src_grf0_dw1 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff); + //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X); + src_grf0_dw0 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff); + + //src_grf0_dw3 =
(Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22) + src_grf0_dw3 = (0 << 31) | (payload.partition_mask << 24) | (0 << 22) + //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18) + | (payload.sad_adjustment << 20)| (0 << 19) | (0 << 18) + //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15) + | (0 << 17) | (0 << 16) | (0 << 15) + //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11) + | (0 << 14) | (0 << 12) | (0 << 11) + //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6) + | (0 << 8) | (0 << 7) | (0 << 6) + //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2) + | (0 << 4) | (0 << 3) | (0 << 2) + //| (Src_Size); + | (0); + + //src_grf0_dw2 = (SrcY << 16) | (SrcX); + src_grf0_dw2 = (payload.srcCoord.y << 16) | (payload.srcCoord.x); + + /*src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21) + | (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18) + | (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7) + | (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);*/ + src_grf1_dw7 = (payload.cost_precision << 16); + //src_grf1_dw6 = Reserved; + src_grf1_dw6 = 0; + /*src_grf1_dw5 = Reseverd for BDW+ + src_grf1_dw4 = Reseverd for BDW+*/ + src_grf1_dw5 = 0; + src_grf1_dw4 = 0; + //src_grf1_dw3 = Weighted SAD Control Sub-block 0...15 + src_grf1_dw3 = 0; + //XXX: should set src_grf1_dw2 + //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20) + src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20) + //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP); + | (0 << 16) | (2 << 8) | (2); + /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29) + | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22) + | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/ + src_grf1_dw1 = (0 
<< 24) | (16); + /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8) + | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5) + | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2) + | (Adaptive_En << 1) | (SkipModeEn);*/ + src_grf1_dw0 = 0; + + //src_grf2_dw7 = SIC Forward Transform Coeff Threshold Matrix[3...6] + src_grf2_dw7 = 0; + //src_grf2_dw6 = SIC Forward Transform Coeff Threshold Matrix[0...2] + src_grf2_dw6 = 0; + //src_grf2_dw5 = (Reserved << 24) | (FBR_SubPredMode_Input << 16) | (FBR_SubMBShape_Input << 8) | (Reserved << 2) | (FBR_MbMode_Input); + src_grf2_dw5 = 0; + //src_grf2_dw4 = MV_4_Cost ... MV_7_Cost; + src_grf2_dw4 = payload.packed_cost_table.s1; + //src_grf2_dw3 = MV_0_Cost ... MV_3_Cost; + src_grf2_dw3 = payload.packed_cost_table.s0; + //src_grf2_dw2 = ... Mode 8 Cost; + src_grf2_dw2 = (payload.packed_shape_cost >> 32) & 0x000000ff; + //src_grf2_dw1 = Mode 4 Cost ... Mode 7 Cost + src_grf2_dw1 = payload.packed_shape_cost; + src_grf2_dw0 = 0; + //src_grf3_dw7 = (BWDCostCenter3Y << 16) | (BWDCostCenter3X) ; + src_grf3_dw7 = payload.cc3 >> 32; + //src_grf3_dw6 = (FWDCostCenter3Y << 16) | (FWDCostCenter3X) ; + src_grf3_dw6 = payload.cc3; + //src_grf3_dw5 = (BWDCostCenter2Y << 16) | (BWDCostCenter2X) ; + src_grf3_dw5 = payload.cc2 >> 32; + //src_grf3_dw4 = (FWDCostCenter2Y << 16) | (FWDCostCenter2X) ; + src_grf3_dw4 = payload.cc2; + //src_grf3_dw3 = (BWDCostCenter1Y << 16) | (BWDCostCenter1X) ; + src_grf3_dw3 = payload.cc1 >> 32; + //src_grf3_dw2 = (FWDCostCenter1Y << 16) | (FWDCostCenter1X) ; + src_grf3_dw2 = payload.cc1; + //src_grf3_dw1 = (BWDCostCenter0Y << 16) | (BWDCostCenter0X) ; + src_grf3_dw1 = payload.cc0 >> 32; + //src_grf3_dw0 = (FWDCostCenter0Y << 16) | (FWDCostCenter0X) ; + src_grf3_dw0 = payload.cc0; + + //XXX: TODO: set search path + src_grf4_dw7 = 0; + src_grf4_dw6 = 0; + src_grf4_dw5 = 0; + src_grf4_dw4 = 0; + src_grf4_dw3 = 0; + src_grf4_dw2 = 
0; + src_grf4_dw1 = 0; + src_grf4_dw0 = 0; + src_grf5_dw7 = 0; + src_grf5_dw6 = 0; + src_grf5_dw5 = 0; + src_grf5_dw4 = 0; + src_grf5_dw3 = 0; + src_grf5_dw2 = 0; + src_grf5_dw1 = 0; + src_grf5_dw0 = 0; + + intel_sub_group_avc_ime_result_t ime_result; + ime_result = __gen_ocl_ime(src_image, ref_image, + src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4, + src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0, + src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4, + src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0, + src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4, + src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0, + src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4, + src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0, + src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4, + src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0, + src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4, + src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0, + src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4, + src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0, + src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4, + src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0, + //msg_type + 2); + + return ime_result; +} + +ulong intel_sub_group_avc_ime_get_motion_vectors(intel_sub_group_avc_ime_result_t result){ + uint lid_x = get_sub_group_local_id(); + uint fwd_mv = 0, bwd_mv = 0; //stay defined even if none of the branches below is taken + if(lid_x < 4){ + fwd_mv = intel_sub_group_shuffle(result.s0, 8 + lid_x*2); + bwd_mv = intel_sub_group_shuffle(result.s0, 9 + lid_x*2); + } + else if(lid_x >= 4 && lid_x < 12){ //lanes 4..11 read result.s1 channels 0..15; lane 12 must fall through to result.s2 + fwd_mv = intel_sub_group_shuffle(result.s1, 0 + (lid_x-4)*2); + bwd_mv = intel_sub_group_shuffle(result.s1, 1 + (lid_x-4)*2); + } + else if(lid_x < 16){ //lanes 12..15 read result.s2 channels 0..7 + fwd_mv = intel_sub_group_shuffle(result.s2, 0 + (lid_x-12)*2); + bwd_mv = intel_sub_group_shuffle(result.s2, 1 + (lid_x-12)*2); + } + + ulong res = ((ulong)bwd_mv << 32) | (ulong)fwd_mv; //widen before shifting: a shift by 32 on a 32-bit uint never reaches the high half + return res; +} + +ushort
intel_sub_group_avc_ime_get_inter_distortions(intel_sub_group_avc_ime_result_t result){ + uint lid_x = get_sub_group_local_id(); + uint write_back_dw = intel_sub_group_shuffle(result.s2, 8 + lid_x/2); + int start_bit = lid_x%2 * 16; + ushort distortion = (write_back_dw >> start_bit); + return distortion; +} + +uchar intel_sub_group_avc_ime_get_inter_major_shape(intel_sub_group_avc_ime_result_t result){ + uint write_back_dw00 = intel_sub_group_shuffle(result.s0, 0); + uchar major_shape = write_back_dw00 & 0x03; + return major_shape; +} + +uchar intel_sub_group_avc_ime_get_inter_minor_shapes(intel_sub_group_avc_ime_result_t result){ + uint write_back_dw06 = intel_sub_group_shuffle(result.s0, 6); + uchar minor_shape = (write_back_dw06 >> 8) & 0xff; + return minor_shape; +} + +uchar intel_sub_group_avc_ime_get_inter_directions(intel_sub_group_avc_ime_result_t result){ + uint write_back_dw06 = intel_sub_group_shuffle(result.s0, 6); + uchar direction = (write_back_dw06 >> 16) & 0xff; + return direction; +} + +intel_sub_group_avc_ref_payload_t +intel_sub_group_avc_fme_initialize(ushort2 src_coord, + ulong motion_vectors, + uchar major_shapes, + uchar minor_shapes, + uchar directions, + uchar pixel_resolution, + uchar sad_adjustment ){ + intel_sub_group_avc_ref_payload_t pl; + pl.srcCoord = src_coord; + pl.mv = motion_vectors; + pl.major_shape = major_shapes; + pl.minor_shapes = minor_shapes; + pl.directions = directions; + pl.pixel_mode = pixel_resolution; + pl.sad_adjustment = sad_adjustment; +#if REF_ENABLE_COST_PENALTY + pl.cc0 = 0; + pl.cc1 = 0; + pl.cc2 = 0; + pl.cc3 = 0; + pl.packed_cost_table = (uint2)(0, 0); + pl.cost_precision = 2; + pl.packed_shape_cost = 0; +#endif + return pl; +} + +intel_sub_group_avc_ref_result_t +intel_sub_group_avc_ref_evaluate_with_single_reference(read_only image2d_t src_image, + read_only image2d_t ref_image, + sampler_t vme_media_sampler, + intel_sub_group_avc_ref_payload_t payload){ + uint src_grf0_dw7; + uint src_grf0_dw6; + uint 
src_grf0_dw5; + uint src_grf0_dw4; + uint src_grf0_dw3; + uint src_grf0_dw2; + uint src_grf0_dw1; + uint src_grf0_dw0; + uint src_grf1_dw7; + uint src_grf1_dw6; + uint src_grf1_dw5; + uint src_grf1_dw4; + uint src_grf1_dw3; + uint src_grf1_dw2; + uint src_grf1_dw1; + uint src_grf1_dw0; + uint src_grf2_dw7; + uint src_grf2_dw6; + uint src_grf2_dw5; + uint src_grf2_dw4; + uint src_grf2_dw3; + uint src_grf2_dw2; + uint src_grf2_dw1; + uint src_grf2_dw0; + uint src_grf3_dw7; + uint src_grf3_dw6; + uint src_grf3_dw5; + uint src_grf3_dw4; + uint src_grf3_dw3; + uint src_grf3_dw2; + uint src_grf3_dw1; + uint src_grf3_dw0; + uint src_grf4_dw7; + uint src_grf4_dw6; + uint src_grf4_dw5; + uint src_grf4_dw4; + uint src_grf4_dw3; + uint src_grf4_dw2; + uint src_grf4_dw1; + uint src_grf4_dw0; + uint src_grf5_dw7; + uint src_grf5_dw6; + uint src_grf5_dw5; + uint src_grf5_dw4; + uint src_grf5_dw3; + uint src_grf5_dw2; + uint src_grf5_dw1; + uint src_grf5_dw0; + uint src_grf6_dw7; + uint src_grf6_dw6; + uint src_grf6_dw5; + uint src_grf6_dw4; + uint src_grf6_dw3; + uint src_grf6_dw2; + uint src_grf6_dw1; + uint src_grf6_dw0; + uint src_grf7_dw7; + uint src_grf7_dw6; + uint src_grf7_dw5; + uint src_grf7_dw4; + uint src_grf7_dw3; + uint src_grf7_dw2; + uint src_grf7_dw1; + uint src_grf7_dw0; + + + //src_grf0_dw7 = Debug; + src_grf0_dw7 = 0; + //src_grf0_dw6 = Debug; + src_grf0_dw6 = 0; + //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id); + src_grf0_dw5 = 0; + //src_grf0_dw4 = Ignored; + src_grf0_dw4 = 0; + //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22) + src_grf0_dw3 = (0 << 31) | (0 << 24) | (0 << 22) + //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18) + | (payload.sad_adjustment << 20)| (0 << 19) | (0 << 18) + //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15) + | (0 << 17) | (0 << 16) | (0 << 15) + //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | 
(Dual_Search_Path_Opt << 11) + | (0 << 14) | (payload.pixel_mode << 12) | (0 << 11) + //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6) + | (0 << 8) | (0 << 7) | (0 << 6) + //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2) + | (0 << 4) | (0 << 3) | (0 << 2) + //| (Src_Size); + | (0); + //src_grf0_dw2 = (SrcY << 16) | (SrcX); + src_grf0_dw2 = (payload.srcCoord.y << 16) | (payload.srcCoord.x); + //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X); + src_grf0_dw1 = 0; + //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X); + src_grf0_dw0 = 0; + + + /*src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21) + | (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18) + | (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7) + | (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);*/ + src_grf1_dw7 = 0; + //src_grf1_dw6 = Reserved; + src_grf1_dw6 = 0; + /*src_grf1_dw5 = Reseverd for BDW+ + src_grf1_dw4 = Reseverd for BDW+*/ + src_grf1_dw5 = 0; + src_grf1_dw4 = 0; + //src_grf1_dw3 = Weighted SAD Control Sub-block 0...15 + src_grf1_dw3 = 0; + //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20) + //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP); + src_grf1_dw2 = 0; + /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29) + | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22) + | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/ + //src_grf1_dw1 = (0 << 24) | (2); + src_grf1_dw1 = (0 << 24) | (16); + /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8) + | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5) + | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2) + | (Adaptive_En << 1) | (SkipModeEn);*/ + src_grf1_dw0 = 0; + + //src_grf2_dw7 = SIC Forward Transform Coeff Threshold 
Matrix[3...6] + src_grf2_dw7 = 0; + //src_grf2_dw6 = SIC Forward Transform Coeff Threshold Matrix[0...2] + src_grf2_dw6 = 0; + //src_grf2_dw5 = (Reserved << 24) | (FBR_SubPredMode_Input << 16) | (FBR_SubMBShape_Input << 8) | (Reserved << 2) | (FBR_MbMode_Input); + src_grf2_dw5 = (0 << 24) | (payload.directions << 16) | (payload.minor_shapes << 8) | (payload.major_shape); +#if REF_ENABLE_COST_PENALTY + //src_grf2_dw4 = MV_4_Cost ... MV_7_Cost; + src_grf2_dw4 = payload.packed_cost_table.s1; + //src_grf2_dw3 = MV_0_Cost ... MV_3_Cost; + src_grf2_dw3 = payload.packed_cost_table.s0; + //src_grf2_dw2 = ... Mode 8 Cost; + src_grf2_dw2 = (payload.packed_shape_cost >> 32) & 0x000000ff; + //src_grf2_dw1 = Mode 4 Cost ... Mode 7 Cost + src_grf2_dw1 = payload.packed_shape_cost; + src_grf2_dw0 = 0; + //src_grf3_dw7 = (BWDCostCenter3Y << 16) | (BWDCostCenter3X) ; + src_grf3_dw7 = payload.cc3 >> 32; + //src_grf3_dw6 = (FWDCostCenter3Y << 16) | (FWDCostCenter3X) ; + src_grf3_dw6 = payload.cc3; + //src_grf3_dw5 = (BWDCostCenter2Y << 16) | (BWDCostCenter2X) ; + src_grf3_dw5 = payload.cc2 >> 32; + //src_grf3_dw4 = (FWDCostCenter2Y << 16) | (FWDCostCenter2X) ; + src_grf3_dw4 = payload.cc2; + //src_grf3_dw3 = (BWDCostCenter1Y << 16) | (BWDCostCenter1X) ; + src_grf3_dw3 = payload.cc1 >> 32; + //src_grf3_dw2 = (FWDCostCenter1Y << 16) | (FWDCostCenter1X) ; + src_grf3_dw2 = payload.cc1; + //src_grf3_dw1 = (BWDCostCenter0Y << 16) | (BWDCostCenter0X) ; + src_grf3_dw1 = payload.cc0 >> 32; + //src_grf3_dw0 = (FWDCostCenter0Y << 16) | (FWDCostCenter0X) ; + src_grf3_dw0 = payload.cc0; +#else + src_grf2_dw4 = 0; + src_grf2_dw3 = 0; + src_grf2_dw2 = 0; + src_grf2_dw1 = 0; + src_grf2_dw0 = 0; + src_grf3_dw7 = 0; + src_grf3_dw6 = 0; + src_grf3_dw5 = 0; + src_grf3_dw4 = 0; + src_grf3_dw3 = 0; + src_grf3_dw2 = 0; + src_grf3_dw1 = 0; + src_grf3_dw0 = 0; +#endif + + //grf4...grf7 = Ref0/1 Sub-block XY 0...15 + int2 bi_mv_temp = as_int2( payload.mv ); + int2 bi_mv = intel_sub_group_shuffle(bi_mv_temp, 
3); + src_grf4_dw7 = bi_mv.s1; + src_grf4_dw6 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 2); + src_grf4_dw5 = bi_mv.s1; + src_grf4_dw4 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 1); + src_grf4_dw3 = bi_mv.s1; + src_grf4_dw2 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 0); + src_grf4_dw1 = bi_mv.s1; + src_grf4_dw0 = bi_mv.s0; + + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 7); + src_grf5_dw7 = bi_mv.s1; + src_grf5_dw6 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 6); + src_grf5_dw5 = bi_mv.s1; + src_grf5_dw4 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 5); + src_grf5_dw3 = bi_mv.s1; + src_grf5_dw2 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 4); + src_grf5_dw1 = bi_mv.s1; + src_grf5_dw0 = bi_mv.s0; + + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 11); + src_grf6_dw7 = bi_mv.s1; + src_grf6_dw6 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 10); + src_grf6_dw5 = bi_mv.s1; + src_grf6_dw4 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 9); + src_grf6_dw3 = bi_mv.s1; + src_grf6_dw2 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 8); + src_grf6_dw1 = bi_mv.s1; + src_grf6_dw0 = bi_mv.s0; + + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 15); + src_grf7_dw7 = bi_mv.s1; + src_grf7_dw6 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 14); + src_grf7_dw5 = bi_mv.s1; + src_grf7_dw4 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 13); + src_grf7_dw3 = bi_mv.s1; + src_grf7_dw2 = bi_mv.s0; + bi_mv = intel_sub_group_shuffle(bi_mv_temp, 12); + src_grf7_dw1 = bi_mv.s1; + src_grf7_dw0 = bi_mv.s0; + + intel_sub_group_avc_ref_result_t ref_result; + ref_result = __gen_ocl_ime(src_image, ref_image, + src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4, + src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0, + src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4, + src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0, + src_grf2_dw7, src_grf2_dw6, 
src_grf2_dw5, src_grf2_dw4, + src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0, + src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4, + src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0, + src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4, + src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0, + src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4, + src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0, + src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4, + src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0, + src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4, + src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0, + //msg_type + 3); + + return ref_result; +} + +ulong intel_sub_group_avc_ref_get_motion_vectors(intel_sub_group_avc_ref_result_t result){ + uint lid_x = get_sub_group_local_id(); + uint fwd_mv = 0, bwd_mv = 0; //stay defined even if none of the branches below is taken + if(lid_x < 4){ + fwd_mv = intel_sub_group_shuffle(result.s0, 8 + lid_x*2); + bwd_mv = intel_sub_group_shuffle(result.s0, 9 + lid_x*2); + } + else if(lid_x >= 4 && lid_x < 12){ //lanes 4..11 read result.s1 channels 0..15; lane 12 must fall through to result.s2 + fwd_mv = intel_sub_group_shuffle(result.s1, 0 + (lid_x-4)*2); + bwd_mv = intel_sub_group_shuffle(result.s1, 1 + (lid_x-4)*2); + } + else if(lid_x < 16){ //lanes 12..15 read result.s2 channels 0..7 + fwd_mv = intel_sub_group_shuffle(result.s2, 0 + (lid_x-12)*2); + bwd_mv = intel_sub_group_shuffle(result.s2, 1 + (lid_x-12)*2); + } + + ulong res = ((ulong)bwd_mv << 32) | (ulong)fwd_mv; //widen before shifting: a shift by 32 on a 32-bit uint never reaches the high half + return res; +} + +ushort intel_sub_group_avc_ref_get_inter_distortions(intel_sub_group_avc_ref_result_t result){ + uint lid_x = get_sub_group_local_id(); + uint write_back_dw = intel_sub_group_shuffle(result.s2, 8 + lid_x/2); + int start_bit = lid_x%2 * 16; + ushort distortion = (write_back_dw >> start_bit); + return distortion; +} + +uint2 intel_sub_group_avc_mce_get_default_medium_penalty_cost_table(void){ + #define COST_PENALTY(idx, base, shift) \ + uchar cost_penalty_##idx = (shift << 4) | (base); + + COST_PENALTY(0, 1, 0) + COST_PENALTY(1, 1, 0) + COST_PENALTY(2, 1, 0) +
COST_PENALTY(3, 1, 0) + COST_PENALTY(4, 1, 0) + COST_PENALTY(5, 1, 0) + COST_PENALTY(6, 1, 0) + COST_PENALTY(7, 1, 0) + uint2 cost_table; + cost_table.s0 = cost_penalty_0 | (cost_penalty_1 << 8) | ( cost_penalty_2 << 16) | (cost_penalty_3 << 24); + cost_table.s1 = cost_penalty_4 | (cost_penalty_5 << 8) | ( cost_penalty_6 << 16) | (cost_penalty_7 << 24); + return cost_table; +} + +intel_sub_group_avc_ime_payload_t +intel_sub_group_avc_ime_set_motion_vector_cost_function(ulong packed_cost_center_delta, + uint2 packed_cost_table, + uchar cost_precision, + intel_sub_group_avc_ime_payload_t payload){ + intel_sub_group_avc_ime_payload_t pl = payload; + pl.packed_cost_table = packed_cost_table; + pl.cost_precision = cost_precision; + + uint lid_x = get_sub_group_local_id(); + if(lid_x == 0) + pl.cc0 = packed_cost_center_delta; + else if(lid_x == 1) + pl.cc1 = packed_cost_center_delta; + else if(lid_x == 2) + pl.cc2 = packed_cost_center_delta; + else if(lid_x == 3) + pl.cc3 = packed_cost_center_delta; + else{ + } + return pl; +} + +#if REF_ENABLE_COST_PENALTY +intel_sub_group_avc_ref_payload_t +intel_sub_group_avc_ref_set_motion_vector_cost_function(ulong packed_cost_center_delta, + uint2 packed_cost_table, + uchar cost_precision, + intel_sub_group_avc_ref_payload_t payload){ + intel_sub_group_avc_ref_payload_t pl = payload; + pl.packed_cost_table = packed_cost_table; + pl.cost_precision = cost_precision; + + uint lid_x = get_sub_group_local_id(); + if(lid_x == 0) + pl.cc0 = packed_cost_center_delta; + else if(lid_x == 1) + pl.cc1 = packed_cost_center_delta; + else if(lid_x == 2) + pl.cc2 = packed_cost_center_delta; + else if(lid_x == 3) + pl.cc3 = packed_cost_center_delta; + else{ + } + return pl; +} + +#endif + +intel_sub_group_avc_ime_payload_t +intel_sub_group_avc_ime_set_inter_shape_penalty(ulong packed_shape_cost, + intel_sub_group_avc_ime_payload_t payload){ + intel_sub_group_avc_ime_payload_t pl = payload; + pl.packed_shape_cost = packed_shape_cost; + return pl; +} 
+
+/* Runs an intra-prediction-only (IPE) evaluation.
+ *
+ * Packs the SIC payload into the 8 x 8 message dwords (src_grfN_dwM) and
+ * sends them through __gen_ocl_ime with msg_type 1. The source image is
+ * passed for both image arguments; vme_media_sampler is not used here.
+ * Every dword that the payload does not drive is sent as 0. */
+intel_sub_group_avc_sic_result_t
+intel_sub_group_avc_sic_evaluate_ipe(read_only image2d_t src_image,
+                                     sampler_t vme_media_sampler,
+                                     intel_sub_group_avc_sic_payload_t payload){
+  //src_grf0_dw7 = Debug;
+  uint src_grf0_dw7 = 0;
+  //src_grf0_dw6 = Debug;
+  uint src_grf0_dw6 = 0;
+  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+  uint src_grf0_dw5 = 0;
+  //src_grf0_dw4 = Ignored;
+  uint src_grf0_dw4 = 0;
+  //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+  //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserved << 18)
+  //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+  //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+  //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+  //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+  //| (Src_Size);
+  //Only Intra_SAD is driven, every other field is 0.
+  uint src_grf0_dw3 = (payload.intra_sad_adjustment << 22);
+  //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+  uint src_grf0_dw2 = (payload.srcCoord.y<<16) | (payload.srcCoord.x);
+  //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+  uint src_grf0_dw1 = 0;
+  //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+  uint src_grf0_dw0 = 0;
+
+  //src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+  //| (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+  //| (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+  //| (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);
+  //Only Mb_Intra_Struct and IntraPartMask are driven.
+  uint src_grf1_dw7 = (payload.intra_neighbour_availabilty << 8) | (payload.luma_intra_partition_mask);
+  //src_grf1_dw6 = Reserved;
+  uint src_grf1_dw6 = 0;
+  //src_grf1_dw5, src_grf1_dw4 = Reserved for BDW+
+  uint src_grf1_dw5 = 0;
+  uint src_grf1_dw4 = 0;
+  //src_grf1_dw3 = Weighted SAD Control Sub-block 0...15
+  uint src_grf1_dw3 = 0;
+  //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+  //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+  uint src_grf1_dw2 = 0;
+  /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+    | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserved << 22)
+    | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+  uint src_grf1_dw1 = 0;
+  /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+    | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+    | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+    | (Adaptive_En << 1) | (SkipModeEn);*/
+  uint src_grf1_dw0 = 0;
+
+  //cost related
+  uint src_grf2_dw7 = 0, src_grf2_dw6 = 0, src_grf2_dw5 = 0, src_grf2_dw4 = 0;
+  uint src_grf2_dw3 = 0, src_grf2_dw2 = 0, src_grf2_dw1 = 0;
+  //src_grf2_dw0 = (MODE_INTRA_4x4 << 24) | (MODE_INTRA_8x8 << 16) | (MODE_INTRA_16x16 << 8) | (MODE_INTRA_NONPRED);
+  uint src_grf2_dw0 = payload.intra_shape_cost;
+  uint src_grf3_dw7 = 0, src_grf3_dw6 = 0, src_grf3_dw5 = 0, src_grf3_dw4 = 0;
+  uint src_grf3_dw3 = 0, src_grf3_dw2 = 0, src_grf3_dw1 = 0, src_grf3_dw0 = 0;
+
+  //Ref* SkipCenter* Delta XY (unused for IPE, all 0)
+  /*src_grf4_dw7 = Ref1_SkipCenter_3_Delta_XY;
+    src_grf4_dw6 = Ref0_SkipCenter_3_Delta_XY;
+    src_grf4_dw5 = Ref1_SkipCenter_2_Delta_XY;
+    src_grf4_dw4 = Ref0_SkipCenter_2_Delta_XY;
+    src_grf4_dw3 = Ref1_SkipCenter_1_Delta_XY;
+    src_grf4_dw2 = Ref0_SkipCenter_1_Delta_XY;
+    src_grf4_dw1 = Ref1_SkipCenter_0_Delta_XY;
+    src_grf4_dw0 = (Ref0_Skip_Center_0_Delta_Y << 16) | (Ref0_Skip_Center_0_Delta_X);*/
+  uint src_grf4_dw7 = 0, src_grf4_dw6 = 0, src_grf4_dw5 = 0, src_grf4_dw4 = 0;
+  uint src_grf4_dw3 = 0, src_grf4_dw2 = 0, src_grf4_dw1 = 0, src_grf4_dw0 = 0;
+
+  //src_grf5_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+  uint src_grf5_dw7 = payload.ur_20_23;
+  //src_grf5_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+  uint src_grf5_dw6 = payload.ur_16_19;
+  //src_grf5_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+  uint src_grf5_dw5 = payload.u_12_15;
+  //src_grf5_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+  uint src_grf5_dw4 = payload.u_8_11;
+  //src_grf5_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+  uint src_grf5_dw3 = payload.u_4_7;
+  //src_grf5_dw2 = (Neighbor pixel Luma value [3, -1] << 24) | (Neighbor pixel Luma value [2, -1] << 16)
+  //| (Neighbor pixel Luma value [1, -1] << 8) | (Neighbor pixel Luma value [0, -1]);
+  uint src_grf5_dw2 = payload.u_0_3;
+
+  //Mode masks start fully set and are cleared for the partition(s) selected
+  //by luma_intra_partition_mask.
+  uchar mode_mask_16_16 = 0xf;
+  ushort mode_mask_8_8 = 0x01ff, mode_mask_4_4 = 0x01ff;
+  if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL){
+    mode_mask_16_16 = 0;
+    mode_mask_8_8 = 0;
+    mode_mask_4_4 = 0;
+  }
+  else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL){
+    mode_mask_16_16 = 0;
+  }
+  else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL){
+    mode_mask_8_8 = 0;
+  }
+  else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL){
+    mode_mask_4_4 = 0;
+  }
+  //src_grf5_dw1 = (Corner_Neighbor_pixel_0 << 24) | (Reserved << 10) | (IntraComputeType << 8)
+  //| (IntraChromaModeMask << 4) | (Intra16x16ModeMask);
+  uint src_grf5_dw1 = (payload.upper_left_corner_luma_pixel << 24) | (0 << 10) | (1 << 8) | (0xf << 4) | (mode_mask_16_16);
+  //src_grf5_dw0 = (Reserved<<25) | (Intra_8x8_Mode_Mask << 16) | (Reserved<<9) | (Intra_4x4_Mode_Mask);
+  uint src_grf5_dw0 = (0<<25) | (mode_mask_8_8 << 16) | (0<<9) | (mode_mask_4_4);
+
+  //src_grf6_dw7 = (Reserved << 24) | (Penalty_4x4_non_DC << 16) | (Penalty_8x8_non_DC << 8) | (Penalty_16x16_non_DC);
+  uint src_grf6_dw7 = 0;
+  //src_grf6_dw6 = Reserved;
+  uint src_grf6_dw6 = 0;
+  //src_grf6_dw5 = (Reserved << 16) | (Neighbor pixel Chroma value CbCr pair [-1, -1]);
+  uint src_grf6_dw5 = 0;
+  //src_grf6_dw4 = (Intra_MxM_Pred_Mode_B15 << 28) | (Intra_MxM_Pred_Mode_B14 << 24) | (Intra_MxM_Pred_Mode_B11 << 20)
+  //| (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12) | (Intra_MxM_Pred_Mode_A13 << 8)
+  //| (Intra_MxM_Pred_Mode_A7 << 4) | (Intra_MxM_Pred_Mode_A5);
+  //XXX: Which value should be set to? Every nibble is currently 2.
+  uint src_grf6_dw4 = (2 << 28) | (2 << 24) | (2 << 20)
+                    | (2 << 16) | (2 << 12) | (2 << 8)
+                    | (2 << 4) | (2);
+  //src_grf6_dw3 = (Corner_Neighbor_pixel_1 << 24) | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+  uint src_grf6_dw3 = payload.l_12_15;
+  //src_grf6_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+  uint src_grf6_dw2 = payload.l_8_11;
+  //src_grf6_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+  uint src_grf6_dw1 = payload.l_4_7;
+  //src_grf6_dw0 = (Neighbor pixel Luma value [-1, 3] << 24) | (Neighbor pixel Luma value [-1, 2] << 16)
+  //| (Neighbor pixel Luma value [-1, 1] << 8) | (Neighbor pixel Luma value [-1, 0]);
+  uint src_grf6_dw0 = payload.l_0_3;
+
+  //chroma related
+  uint src_grf7_dw7 = 0, src_grf7_dw6 = 0, src_grf7_dw5 = 0, src_grf7_dw4 = 0;
+  uint src_grf7_dw3 = 0, src_grf7_dw2 = 0, src_grf7_dw1 = 0, src_grf7_dw0 = 0;
+
+  intel_sub_group_avc_sic_result_t ime_result;
+  ime_result = __gen_ocl_ime(src_image, src_image,
+                         src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+                         src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+                         src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+                         src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+                         src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+                         src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+                         src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+                         src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+                         src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+                         src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+                         src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+                         src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+                         src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+                         src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+                         src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+                         src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+                         //msg_type
+                         1);
+
+  return ime_result;
+}
+
+/* Creates a SIC payload for the macro-block at src_coord.
+ *
+ * Every field that the evaluate functions read is given a defined value (0),
+ * so a payload that skipped configure_ipe and/or configure_skc no longer
+ * feeds indeterminate data into the hardware message. */
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_initialize(ushort2 src_coord ){
+  intel_sub_group_avc_sic_payload_t pl;
+  pl.srcCoord = src_coord;
+  pl.intra_shape_cost = 0;
+
+  //fields normally filled in by intel_sub_group_avc_sic_configure_ipe
+  pl.luma_intra_partition_mask = 0;
+  pl.intra_neighbour_availabilty = 0;
+  pl.intra_sad_adjustment = 0;
+  pl.upper_left_corner_luma_pixel = 0;
+  pl.l_0_3 = 0;
+  pl.l_4_7 = 0;
+  pl.l_8_11 = 0;
+  pl.l_12_15 = 0;
+  pl.u_0_3 = 0;
+  pl.u_4_7 = 0;
+  pl.u_8_11 = 0;
+  pl.u_12_15 = 0;
+  pl.ur_16_19 = 0;
+  pl.ur_20_23 = 0;
+
+  //fields normally filled in by intel_sub_group_avc_sic_configure_skc
+  pl.skip_block_partition_type = 0;
+  pl.skip_motion_vector_mask = 0;
+  pl.bidirectional_weight = 0;
+  pl.skip_sad_adjustment = 0;
+  pl.mv = 0;
+  return pl;
+}
+
+/* Returns a copy of the payload configured for intra prediction.
+ *
+ * The three edge arguments are per-lane values: lane i supplies pixel i.
+ * They are gathered with sub-group shuffles and packed four pixels per
+ * dword, lowest pixel index in the lowest byte:
+ *   left edge        (16 pixels) -> l_0_3 .. l_12_15
+ *   upper edge       (16 pixels) -> u_0_3 .. u_12_15
+ *   upper-right edge ( 8 pixels) -> ur_16_19, ur_20_23
+ * Bytes are widened to uint before shifting so bit 31 is never produced by
+ * a signed shift. */
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_configure_ipe(uchar luma_intra_partition_mask,
+                                      uchar intra_neighbour_availabilty,
+                                      uchar left_edge_luma_pixels,
+                                      uchar upper_left_corner_luma_pixel,
+                                      uchar upper_edge_luma_pixels,
+                                      uchar upper_right_edge_luma_pixels,
+                                      uchar intra_sad_adjustment,
+                                      intel_sub_group_avc_sic_payload_t payload ){
+  intel_sub_group_avc_sic_payload_t pl = payload;
+  pl.luma_intra_partition_mask = luma_intra_partition_mask;
+  pl.intra_neighbour_availabilty = intra_neighbour_availabilty;
+  uint pixel[16];
+  for(uint i = 0; i < 16; i++)
+    pixel[i] = intel_sub_group_shuffle(left_edge_luma_pixels, i);
+
+  pl.l_0_3 = (pixel[3] << 24) | (pixel[2] << 16) | (pixel[1] << 8) | (pixel[0]);
+  pl.l_4_7 = (pixel[7] << 24) | (pixel[6] << 16) | (pixel[5] << 8) | (pixel[4]);
+  pl.l_8_11 = (pixel[11] << 24) | (pixel[10] << 16) | (pixel[9] << 8) | (pixel[8]);
+  pl.l_12_15 = (pixel[15] << 24) | (pixel[14] << 16) | (pixel[13] << 8) | (pixel[12]);
+
+  for(uint i = 0; i < 16; i++)
+    pixel[i] = intel_sub_group_shuffle(upper_edge_luma_pixels, i);
+  pl.u_0_3 = (pixel[3] << 24) | (pixel[2] << 16) | (pixel[1] << 8) | (pixel[0]);
+  pl.u_4_7 = (pixel[7] << 24) | (pixel[6] << 16) | (pixel[5] << 8) | (pixel[4]);
+  pl.u_8_11 = (pixel[11] << 24) | (pixel[10] << 16) | (pixel[9] << 8) | (pixel[8]);
+  pl.u_12_15 = (pixel[15] << 24) | (pixel[14] << 16) | (pixel[13] << 8) | (pixel[12]);
+
+  //only lanes 0..7 carry upper-right pixels
+  for(uint i = 0; i < 8; i++)
+    pixel[i] = intel_sub_group_shuffle(upper_right_edge_luma_pixels, i);
+  pl.ur_16_19 = (pixel[3] << 24) | (pixel[2] << 16) | (pixel[1] << 8) | (pixel[0]);
+  pl.ur_20_23 = (pixel[7] << 24) | (pixel[6] << 16) | (pixel[5] << 8) | (pixel[4]);
+
+  pl.upper_left_corner_luma_pixel = upper_left_corner_luma_pixel;
+  pl.intra_sad_adjustment = intra_sad_adjustment;
+  return pl;
+}
+
+/* Returns a copy of the SIC payload with intra_shape_cost replaced. */
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_set_intra_luma_shape_penalty(uint packed_shape_cost,
+                                                     intel_sub_group_avc_sic_payload_t payload ){
+  intel_sub_group_avc_sic_payload_t pl = payload;
+  pl.intra_shape_cost = packed_shape_cost;
+  return pl;
+}
+
+/* Runs a skip/intra check (SIC) evaluation against one reference image.
+ *
+ * Packs the SIC payload into the 8 x 8 message dwords (src_grfN_dwM) and
+ * sends them through __gen_ocl_ime with msg_type 1. vme_media_sampler is
+ * not used here. Every dword the payload does not drive is sent as 0. */
+intel_sub_group_avc_sic_result_t
+intel_sub_group_avc_sic_evaluate_with_single_reference(read_only image2d_t src_image,
+                                                       read_only image2d_t ref_image,
+                                                       sampler_t vme_media_sampler,
+                                                       intel_sub_group_avc_sic_payload_t payload){
+  //src_grf0_dw7 = Debug;
+  uint src_grf0_dw7 = 0;
+  //src_grf0_dw6 = Debug;
+  uint src_grf0_dw6 = 0;
+  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+  uint src_grf0_dw5 = 0;
+  //src_grf0_dw4 = Ignored;
+  uint src_grf0_dw4 = 0;
+  //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+  //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserved << 18)
+  //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+  //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+  //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+  //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+  //| (Src_Size);
+  //Only Intra_SAD and Inter_SAD are driven here, every other field starts at 0.
+  uint src_grf0_dw3 = (payload.intra_sad_adjustment << 22)
+                    | (payload.skip_sad_adjustment << 20);
+  //The partition type is OR-ed in unshifted.
+  //NOTE(review): presumably the CLK_AVC_ME_SKIP_BLOCK_PARTITION_* values
+  //already carry their bit position - confirm against the header.
+  src_grf0_dw3 |= payload.skip_block_partition_type;
+  //Block-Based Skip Enabled
+  if(payload.skip_block_partition_type == CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL)
+    src_grf0_dw3 |= (1 << 19);
+  //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+  uint src_grf0_dw2 = (payload.srcCoord.y << 16) | (payload.srcCoord.x);
+  //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+  uint src_grf0_dw1 = 0;
+  //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+  uint src_grf0_dw0 = 0;
+
+  //src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+  //| (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+  //| (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+  //| (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);
+  uint src_grf1_dw7 = (payload.intra_neighbour_availabilty << 8) | (payload.luma_intra_partition_mask);
+  //The skip mask is OR-ed in unshifted.
+  //NOTE(review): presumably the mask values are pre-shifted to bit 24 - confirm.
+  src_grf1_dw7 |= payload.skip_motion_vector_mask;
+  //src_grf1_dw6 = Reserved;
+  uint src_grf1_dw6 = 0;
+  /*src_grf1_dw5 = (Cost_Center1Y << 16) | (Cost_Center1X);
+    src_grf1_dw4 = (Cost_Center0Y << 16) | (Cost_Center0X);
+    src_grf1_dw3 = (Ime_Too_Good << 24 ) | (Ime_Too_Bad << 16) | (Part_Tolerance_Thrhd << 8) | (FBPrunThrhd);*/
+  uint src_grf1_dw5 = 0;
+  uint src_grf1_dw4 = 0;
+  uint src_grf1_dw3 = 0;
+  //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+  //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+  uint src_grf1_dw2 = 0;
+  /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+    | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserved << 22)
+    | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+  //Bi_Weight comes from the payload, MaxNumMVs is fixed at 16.
+  uint src_grf1_dw1 = (0 << 24) | (payload.bidirectional_weight << 16) | (16);
+  /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+    | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+    | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+    | (Adaptive_En << 1) | (SkipModeEn);*/
+  //Only SkipModeEn is set.
+  uint src_grf1_dw0 = 1;
+
+  //src_grf2_dw7 = SIC Forward Transform Coeff Threshold Matrix[3...6]
+  uint src_grf2_dw7 = 0;
+  //src_grf2_dw6 = SIC Forward Transform Coeff Threshold Matrix[0...2]
+  uint src_grf2_dw6 = 0;
+  //src_grf2_dw5 = (Reserved << 24) | (FBR_SubPredMode_Input << 16) | (FBR_SubMBShape_Input << 8) | (Reserved << 2) | (FBR_MbMode_Input);
+  uint src_grf2_dw5 = 0;
+  //XXX: TO DO: setting mv cost related bit field
+  //src_grf2_dw4 = MV_4_Cost ... MV_7_Cost;
+  uint src_grf2_dw4 = 0;
+  //src_grf2_dw3 = MV_0_Cost ... MV_3_Cost;
+  uint src_grf2_dw3 = 0;
+  //src_grf2_dw2 = (Chroma_Intra_Mode_Cost << 24) | (RefID_Cost << 16) | (Mode_9_Cost << 8) | (Mode_8_Cost);
+  uint src_grf2_dw2 = 0;
+  //src_grf2_dw1 = Mode 4 Cost ... Mode 7 Cost
+  uint src_grf2_dw1 = 0;
+  //src_grf2_dw0 = (MODE_INTRA_4x4 << 24) | (MODE_INTRA_8x8 << 16) | (MODE_INTRA_16x16 << 8) | (MODE_INTRA_NONPRED);
+  uint src_grf2_dw0 = payload.intra_shape_cost;
+
+  //src_grf3_dw(2n+1) = (BWDCostCenter<n>Y << 16) | (BWDCostCenter<n>X);
+  //src_grf3_dw(2n)   = (FWDCostCenter<n>Y << 16) | (FWDCostCenter<n>X);  n = 0..3
+  //Cost centers are not programmed for SIC yet, all 0.
+  uint src_grf3_dw7 = 0, src_grf3_dw6 = 0, src_grf3_dw5 = 0, src_grf3_dw4 = 0;
+  uint src_grf3_dw3 = 0, src_grf3_dw2 = 0, src_grf3_dw1 = 0, src_grf3_dw0 = 0;
+
+  //Ref1/Ref0 SkipCenter 3...0 Delta XY
+  //Lanes 0..3 each hold one packed vector pair in payload.mv;
+  //.s0 (low dword) goes to the Ref0 slot, .s1 (high dword) to the Ref1 slot.
+  int2 bi_mv_temp = as_int2( payload.mv );
+  int2 bi_mv = intel_sub_group_shuffle(bi_mv_temp, 3);
+  uint src_grf4_dw7 = bi_mv.s1;
+  uint src_grf4_dw6 = bi_mv.s0;
+  bi_mv = intel_sub_group_shuffle(bi_mv_temp, 2);
+  uint src_grf4_dw5 = bi_mv.s1;
+  uint src_grf4_dw4 = bi_mv.s0;
+  bi_mv = intel_sub_group_shuffle(bi_mv_temp, 1);
+  uint src_grf4_dw3 = bi_mv.s1;
+  uint src_grf4_dw2 = bi_mv.s0;
+  bi_mv = intel_sub_group_shuffle(bi_mv_temp, 0);
+  uint src_grf4_dw1 = bi_mv.s1;
+  uint src_grf4_dw0 = bi_mv.s0;
+
+  //src_grf5_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+  uint src_grf5_dw7 = payload.ur_20_23;
+  //src_grf5_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+  uint src_grf5_dw6 = payload.ur_16_19;
+  //src_grf5_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+  uint src_grf5_dw5 = payload.u_12_15;
+  //src_grf5_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+  uint src_grf5_dw4 = payload.u_8_11;
+  //src_grf5_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+  uint src_grf5_dw3 = payload.u_4_7;
+  //src_grf5_dw2 = (Neighbor pixel Luma value [3, -1] << 24) | (Neighbor pixel Luma value [2, -1] << 16)
+  //| (Neighbor pixel Luma value [1, -1] << 8) | (Neighbor pixel Luma value [0, -1]);
+  uint src_grf5_dw2 = payload.u_0_3;
+
+  //Mode masks start fully set and are cleared for the partition(s) selected
+  //by luma_intra_partition_mask.
+  uchar mode_mask_16_16 = 0xf;
+  ushort mode_mask_8_8 = 0x01ff, mode_mask_4_4 = 0x01ff;
+  if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL){
+    mode_mask_16_16 = 0;
+    mode_mask_8_8 = 0;
+    mode_mask_4_4 = 0;
+  }
+  else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL){
+    mode_mask_16_16 = 0;
+  }
+  else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL){
+    mode_mask_8_8 = 0;
+  }
+  else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL){
+    mode_mask_4_4 = 0;
+  }
+  //src_grf5_dw1 = (Corner_Neighbor_pixel_0 << 24) | (Reserved << 10) | (IntraComputeType << 8)
+  //| (IntraChromaModeMask << 4) | (Intra16x16ModeMask);
+  uint src_grf5_dw1 = (payload.upper_left_corner_luma_pixel << 24) | (0 << 10) | (1 << 8) | (0xf << 4) | (mode_mask_16_16);
+  //src_grf5_dw0 = (Reserved<<25) | (Intra_8x8_Mode_Mask << 16) | (Reserved<<9) | (Intra_4x4_Mode_Mask);
+  uint src_grf5_dw0 = (0<<25) | (mode_mask_8_8 << 16) | (0<<9) | (mode_mask_4_4);
+
+  //src_grf6_dw7 = (Reserved << 24) | (Penalty_4x4_non_DC << 16) | (Penalty_8x8_non_DC << 8) | (Penalty_16x16_non_DC);
+  uint src_grf6_dw7 = 0;
+  //src_grf6_dw6 = Reserved;
+  uint src_grf6_dw6 = 0;
+  //src_grf6_dw5 = (Reserved << 16) | (Neighbor pixel Chroma value CbCr pair [-1, -1]);
+  uint src_grf6_dw5 = 0;
+  //src_grf6_dw4 = (Intra_MxM_Pred_Mode_B15 << 28) | (Intra_MxM_Pred_Mode_B14 << 24) | (Intra_MxM_Pred_Mode_B11 << 20)
+  //| (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12) | (Intra_MxM_Pred_Mode_A13 << 8)
+  //| (Intra_MxM_Pred_Mode_A7 << 4) | (Intra_MxM_Pred_Mode_A5);
+  //XXX: Which value should be set to? Every nibble is currently 2.
+  uint src_grf6_dw4 = (2 << 28) | (2 << 24) | (2 << 20)
+                    | (2 << 16) | (2 << 12) | (2 << 8)
+                    | (2 << 4) | (2);
+  //src_grf6_dw3 = (Corner_Neighbor_pixel_1 << 24) | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+  uint src_grf6_dw3 = payload.l_12_15;
+  //src_grf6_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+  uint src_grf6_dw2 = payload.l_8_11;
+  //src_grf6_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+  uint src_grf6_dw1 = payload.l_4_7;
+  //src_grf6_dw0 = (Neighbor pixel Luma value [-1, 3] << 24) | (Neighbor pixel Luma value [-1, 2] << 16)
+  //| (Neighbor pixel Luma value [-1, 1] << 8) | (Neighbor pixel Luma value [-1, 0]);
+  uint src_grf6_dw0 = payload.l_0_3;
+
+  //chroma related
+  uint src_grf7_dw7 = 0, src_grf7_dw6 = 0, src_grf7_dw5 = 0, src_grf7_dw4 = 0;
+  uint src_grf7_dw3 = 0, src_grf7_dw2 = 0, src_grf7_dw1 = 0, src_grf7_dw0 = 0;
+
+  //Declared with the SIC result type to match the function's return type.
+  intel_sub_group_avc_sic_result_t sic_result;
+  sic_result = __gen_ocl_ime(src_image, ref_image,
+                         src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+                         src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+                         src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+                         src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+                         src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+                         src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+                         src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+                         src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+                         src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+                         src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+                         src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+                         src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+                         src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+                         src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+                         src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+                         src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+                         //msg_type
+                         1);
+
+  return sic_result;
+}
+
+/* Returns a copy of the SIC payload configured for a skip check:
+ * stores the partition type, the skip motion-vector mask, the bidirectional
+ * weight, the skip SAD adjustment and this lane's packed motion vectors. */
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_configure_skc(uint skip_block_partition_type,
+                                      uint skip_motion_vector_mask,
+                                      ulong motion_vectors,
+                                      char bidirectional_weight,
+                                      uchar skip_sad_adjustment,
+                                      intel_sub_group_avc_sic_payload_t payload){
+  intel_sub_group_avc_sic_payload_t pl = payload;
+  pl.skip_block_partition_type = skip_block_partition_type;
+  pl.skip_motion_vector_mask = skip_motion_vector_mask;
+  pl.bidirectional_weight = bidirectional_weight;
+  pl.skip_sad_adjustment = skip_sad_adjustment;
+  pl.mv = motion_vectors;
+  return pl;
+}
+
+/* Returns the 16-bit inter distortion for the calling lane.
+ * Two distortions share one dword of result.s2 starting at dword 8:
+ * even lanes take the low half, odd lanes the high half. */
+ushort
+intel_sub_group_avc_sic_get_inter_distortions(intel_sub_group_avc_sic_result_t result){
+  uint lid_x = get_sub_group_local_id();
+  uint write_back_dw = intel_sub_group_shuffle(result.s2, 8 + lid_x/2);
+  int start_bit = lid_x%2 * 16;
+  ushort distortion = (write_back_dw >> start_bit);
+  return distortion;
+}
+
+/* Returns the chosen intra luma shape: bits 1..0 of write-back dword 0
+ * (lane 0 of result.s0). */
+uchar
+intel_sub_group_avc_sic_get_ipe_luma_shape(intel_sub_group_avc_sic_result_t result){
+  uint write_back_dw00 = intel_sub_group_shuffle(result.s0, 0);
+  uchar luma_shape = write_back_dw00 & 0x03;
+  return luma_shape;
+}
+
+/* Returns the best intra luma distortion: the low 16 bits of write-back
+ * dword 3 (lane 3 of result.s0). */
+ushort
+intel_sub_group_avc_sic_get_best_ipe_luma_distortion(intel_sub_group_avc_sic_result_t result){
+  uint write_back_dw03 = intel_sub_group_shuffle(result.s0, 3);
+  ushort luma_distortion = write_back_dw03;
+  return luma_distortion;
+}
+
+ulong intel_sub_group_avc_sic_get_packed_ipe_luma_modes(intel_sub_group_avc_sic_result_t result){
+  uint write_back_dw00 = intel_sub_group_shuffle(result.s0, 0);
+  uchar luma_shape = write_back_dw00 & 0x03;
+  ulong luma_modes = 0;
+  uint write_back_dw04 = intel_sub_group_shuffle(result.s0, 4);
+  uint write_back_dw05 = intel_sub_group_shuffle(result.s0, 5);
+  if(luma_shape == CLK_AVC_ME_INTRA_16x16_INTEL)
+    luma_modes |= (write_back_dw04 & 0x03);
+  else if(luma_shape == 
CLK_AVC_ME_INTRA_8x8_INTEL){ + ulong modes_temp = write_back_dw04; + luma_modes = (modes_temp & 0x0f) | ((modes_temp & 0x00f0) << 12) | ((modes_temp & 0x0f00) << 24) | ((modes_temp & 0x0000f000) << 36); + } + else if(luma_shape == CLK_AVC_ME_INTRA_4x4_INTEL){ + ulong modes_temp = write_back_dw05; + luma_modes = (modes_temp << 32) | (write_back_dw04 & 0x00000000ffffffff); + } + return luma_modes; +} + bool __gen_ocl_in_local(size_t p) { bool cond1 = p > 0; bool cond2 = p < 64*1024; diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 96c81b92..a9df6525 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -4048,6 +4048,7 @@ namespace gbe case GEN_OCL_SIMD_ID: case GEN_OCL_SIMD_SHUFFLE: case GEN_OCL_VME: + case GEN_OCL_IME: case GEN_OCL_WORK_GROUP_ALL: case GEN_OCL_WORK_GROUP_ANY: case GEN_OCL_WORK_GROUP_BROADCAST: @@ -4953,6 +4954,41 @@ namespace gbe lut_sub_x.getIntegerValue()); break; } + case GEN_OCL_IME: + { + + const uint8_t imageID = getImageID(I); + + AI++; + AI++; + + Constant *msg_type_cpv = dyn_cast(*(AI + 64)); + assert(msg_type_cpv); + const ir::Immediate &msg_type_x = processConstantImm(msg_type_cpv); + int msg_type = msg_type_x.getIntegerValue(); + // msy_type (00: IDM [BDW+], 01: SIC, 10: IME, 11: FBR) + GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3); + uint32_t src_length = ((msg_type == 1 || msg_type == 3) ? 
64 : 48); + + vector dstTupleData, srcTupleData; + for (uint32_t i = 0; i < src_length; i++, AI++){ + srcTupleData.push_back(this->getRegister(*AI)); + } + + const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], src_length); + + uint32_t dst_length; + dst_length = 7; + for (uint32_t elemID = 0; elemID < dst_length; ++elemID) { + const ir::Register reg = this->getRegister(&I, elemID); + dstTupleData.push_back(reg); + } + const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], dst_length); + + ctx.IME(imageID, dstTuple, srcTuple, dst_length, src_length, + msg_type); + break; + } case GEN_OCL_IN_PRIVATE: { const ir::Register dst = this->getRegister(&I); diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index d3802d20..a9873ca4 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -179,6 +179,7 @@ DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region) DECL_LLVM_GEN_FUNCTION(IN_PRIVATE, __gen_ocl_in_private) DECL_LLVM_GEN_FUNCTION(VME, __gen_ocl_vme) +DECL_LLVM_GEN_FUNCTION(IME, __gen_ocl_ime) // printf function DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf_stub) diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp index be3d5499..2d8d7bae 100644 --- a/backend/src/llvm/llvm_scalarize.cpp +++ b/backend/src/llvm/llvm_scalarize.cpp @@ -717,6 +717,7 @@ namespace gbe { break; } case GEN_OCL_VME: + case GEN_OCL_IME: case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2: case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4: case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8: diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 55b1a230..43ff8fed 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -154,6 +154,13 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, image->intel_fmt, image->image_type, image->bpp, image->w, image->h, image->depth, image->row_pitch, image->slice_pitch, 
(cl_gpgpu_tiling)image->tiling); + //We always setup media surface state, so this surface can be used for vme + else if( (image->fmt.image_channel_order == CL_R) && (image->fmt.image_channel_data_type == CL_UNORM_INT8) ) + cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, + image->offset + k->args[id].mem->offset, + image->intel_fmt, image->image_type, image->bpp, + image->w, image->h, image->depth, + image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling); } return CL_SUCCESS; } diff --git a/src/cl_device_id.c b/src/cl_device_id.c index 1960463e..5e284193 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -576,6 +576,7 @@ skl_gt1_break: #endif cl_intel_platform_get_default_extension(ret); cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id); + cl_intel_platform_enable_extension(ret, cl_intel_device_side_avc_motion_estimation_ext_id); break; case PCI_CHIP_SKYLAKE_ULT_GT2: @@ -601,6 +602,7 @@ skl_gt2_break: #endif cl_intel_platform_get_default_extension(ret); cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id); + cl_intel_platform_enable_extension(ret, cl_intel_device_side_avc_motion_estimation_ext_id); break; case PCI_CHIP_SKYLAKE_ULT_GT3: @@ -624,6 +626,7 @@ skl_gt3_break: cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id); #endif cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id); + cl_intel_platform_enable_extension(ret, cl_intel_device_side_avc_motion_estimation_ext_id); break; case PCI_CHIP_SKYLAKE_DT_GT4: @@ -643,6 +646,7 @@ skl_gt4_break: #endif cl_intel_platform_get_default_extension(ret); cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id); + cl_intel_platform_enable_extension(ret, cl_intel_device_side_avc_motion_estimation_ext_id); break; case PCI_CHIP_BROXTON_0: diff --git a/src/cl_extensions.c b/src/cl_extensions.c index 56099ad0..4987bee2 100644 --- a/src/cl_extensions.c +++ b/src/cl_extensions.c @@ -70,7 +70,7 @@ 
check_intel_extension(cl_extensions_t *extensions) int id; for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++) { - if(id != EXT_ID(intel_motion_estimation)) + if(id != EXT_ID(intel_motion_estimation) && id != EXT_ID(intel_device_side_avc_motion_estimation)) extensions->extensions[id].base.ext_enabled = 1; if(id == EXT_ID(intel_required_subgroup_size)) #if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR > 40 diff --git a/src/cl_extensions.h b/src/cl_extensions.h index bb61c0bc..b32b2362 100644 --- a/src/cl_extensions.h +++ b/src/cl_extensions.h @@ -32,7 +32,8 @@ DECL_EXT(intel_subgroups_short) \ DECL_EXT(intel_required_subgroup_size) \ DECL_EXT(intel_media_block_io) \ - DECL_EXT(intel_planar_yuv) + DECL_EXT(intel_planar_yuv) \ + DECL_EXT(intel_device_side_avc_motion_estimation) #define DECL_GL_EXTENSIONS \ DECL_EXT(khr_gl_sharing)\ @@ -67,7 +68,7 @@ cl_khr_extension_id_max #define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics) #define OPT1_EXT_END_ID EXT_ID(khr_icd) #define INTEL_EXT_START_ID EXT_ID(intel_accelerator) -#define INTEL_EXT_END_ID EXT_ID(intel_planar_yuv) +#define INTEL_EXT_END_ID EXT_ID(intel_device_side_avc_motion_estimation) #define GL_EXT_START_ID EXT_ID(khr_gl_sharing) #define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 2b778e5a..b0d6bd94 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -1337,6 +1337,75 @@ intel_gpgpu_bind_image_for_vme_gen7(intel_gpgpu_t *gpgpu, assert(index < GEN_MAX_SURFACES); } +static void +intel_gpgpu_bind_image_for_vme_gen9(intel_gpgpu_t *gpgpu, + uint32_t index, + dri_bo* obj_bo, + uint32_t obj_bo_offset, + uint32_t format, + cl_mem_object_type type, + uint32_t bpp, + int32_t w, + int32_t h, + int32_t depth, + int32_t pitch, + int32_t slice_pitch, + int32_t tiling) +{ + surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset; + gen9_media_surface_state_t *ss = (gen9_media_surface_state_t *) 
&heap->surface[index * sizeof(gen8_surface_state_t)]; + + memset(ss, 0, sizeof(gen8_surface_state_t)); + ss->ss0.rotation = 0; //++ + ss->ss1.uv_offset_v_direction = 0; + ss->ss1.pic_struct = 0; + ss->ss1.width = w - 1; + ss->ss1.height = h - 1; + if (tiling == GPGPU_NO_TILE) { + ss->ss2.tile_mode = 0; + } + else if (tiling == GPGPU_TILE_X){ + ss->ss2.tile_mode = 2; + } + else if (tiling == GPGPU_TILE_Y){ + ss->ss2.tile_mode = 3; + } + ss->ss2.half_pitch_for_chroma = 0; + ss->ss2.surface_pitch = pitch - 1; + ss->ss2.address_control = 1; //++ CLAMP: 0; MIRROR:1; + ss->ss2.mem_compress_enable = 0; //++ + ss->ss2.mem_compress_mode = 0; //++ + ss->ss2.uv_offset_v_direction_msb = 0; //++ + ss->ss2.uv_offset_u_direction = 0; //++ + ss->ss2.interleave_chroma = 0; + ss->ss2.surface_format = 12; //Y8_UNORM + //ss->ss2.surface_format = 4; //PLANAR_420_8 + ss->ss3.y_offset_for_u = 0; + ss->ss3.x_offset_for_u = 0; + ss->ss4.y_offset_for_v = 0; + ss->ss4.x_offset_for_v = 0; + ss->ss5.surface_object_control_state = cl_gpgpu_get_cache_ctrl(); + ss->ss5.tiled_res_mode = 0; //++ TRMODE_NONE: 0; TRMODE_TILEYF: 1; TRMODE_TILEYS:2 + ss->ss5.vert_line_stride_offset = 0; //++ + ss->ss5.vert_line_stride = 0; //++ + ss->ss6.base_addr = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff; // + ss->ss7.base_addr_high = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff; // + + + heap->binding_table[index] = offsetof(surface_heap_t, surface) + + index * surface_state_sz; + dri_bo_emit_reloc(gpgpu->aux_buf.bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + obj_bo_offset, + gpgpu->aux_offset.surface_heap_offset + + heap->binding_table[index] + + offsetof(gen9_media_surface_state_t, ss6), + obj_bo); + + assert(index < GEN_MAX_SURFACES); +} + static void intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu, @@ -2562,6 +2631,7 @@ intel_set_gpgpu_callbacks(int device_id) } if (IS_GEN9(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen9; + 
cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen9; intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8; cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen9; intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8; diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h index b38cc423..282929d7 100644 --- a/src/intel/intel_structs.h +++ b/src/intel/intel_structs.h @@ -425,6 +425,69 @@ typedef struct gen7_media_surface_state } ss7; } gen7_media_surface_state_t; +typedef struct gen9_media_surface_state +{ + struct { + uint32_t pad3:12; + uint32_t pad2:4; + uint32_t pad1:11; //ExistsIf [Surface Format] is not one of Planar Formats + uint32_t rotation:2; + } ss0; + + struct { + uint32_t uv_offset_v_direction:2; + uint32_t pic_struct:2; + uint32_t width:14; + uint32_t height:14; + } ss1; + + struct { + uint32_t tile_mode:2; + uint32_t half_pitch_for_chroma:1; + uint32_t surface_pitch:18; + uint32_t address_control:1; + uint32_t mem_compress_enable:1; + uint32_t mem_compress_mode:1; + uint32_t uv_offset_v_direction_msb:1; + uint32_t uv_offset_u_direction:1; + uint32_t interleave_chroma:1; + uint32_t surface_format:5; + } ss2; + + struct { + uint32_t y_offset_for_u:14; + uint32_t pad1:2; + uint32_t x_offset_for_u:14; + uint32_t pad0:2; + } ss3; + + struct { + uint32_t y_offset_for_v:15; + uint32_t pad1:1; + uint32_t x_offset_for_v:14; + uint32_t pad0:2; + } ss4; + + struct { + uint32_t surface_object_control_state:7; + uint32_t pad2:11; + uint32_t tiled_res_mode:2; + uint32_t pad1:4; + uint32_t pad0:6; + uint32_t vert_line_stride_offset:1; + uint32_t vert_line_stride:1; + } ss5; + + struct { + uint32_t base_addr; + } ss6; + + struct { + uint32_t base_addr_high:16; + uint32_t pad0:16; + } ss7; +} gen9_media_surface_state_t; + typedef union gen_surface_state { gen7_surface_state_t gen7_surface_state; diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp index 
2e826bc6..52d17146 100644 --- a/utests/utest_helper.cpp +++ b/utests/utest_helper.cpp @@ -895,6 +895,24 @@ int cl_check_motion_estimation(void) return 1; } +int cl_check_device_side_avc_motion_estimation(void) +{ + std::string extStr; + size_t param_value_size; + OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, &param_value_size); + std::vector<char> param_value(param_value_size); + OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size, + param_value.empty() ? NULL : &param_value.front(), &param_value_size); + if (!param_value.empty()) + extStr = std::string(&param_value.front(), param_value_size-1); + + if (std::strstr(extStr.c_str(), "cl_intel_device_side_avc_motion_estimation") == NULL) { + printf("No cl_intel_device_side_avc_motion_estimation, Skip!"); + return 0; + } + return 1; +} + int cl_check_subgroups(void) { std::string extStr; diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp index c3040087..fe6d1bbb 100644 --- a/utests/utest_helper.hpp +++ b/utests/utest_helper.hpp @@ -315,6 +315,9 @@ extern clGetKernelSubGroupInfoKHR_cb* utestclGetKernelSubGroupInfoKHR; /* Check if cl_intel_motion_estimation enabled. */ extern int cl_check_motion_estimation(void); +/* Check if cl_intel_device_side_avc_motion_estimation enabled. */ +extern int cl_check_device_side_avc_motion_estimation(void); + /* Check is cl version 2.0 or Beignet extension. */ extern int cl_check_ocl20(bool or_beignet = true); -- cgit v1.2.1