/*
 * Copyright © 2012 Intel Corporation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * \file gen9_context.cpp
 */

#include "backend/gen9_context.hpp"
#include "backend/gen_insn_selection.hpp"
#include "backend/gen_program.hpp"

namespace gbe
{
  void Gen9Context::newSelection(void) {
    this->sel = GBE_NEW(Selection9, *this);
  }

  void Gen9Context::emitBarrierInstruction(const SelectionInstruction &insn) {
    const GenRegister src = ra->genReg(insn.src(0));
    const GenRegister fenceDst = ra->genReg(insn.dst(0));
    uint32_t barrierType = insn.extra.barrierType;
    const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
    bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;

    if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
      p->FENCE(fenceDst, imageFence);
      p->MOV(fenceDst, fenceDst);
    }
    p->push();
    // As only payload.2 is used and all the other regions are ignored,
    // SIMD8 mode here is safe.
    p->curr.execWidth = 8;
    p->curr.physicalFlag = 0;
    p->curr.noMask = 1;
    // Copy the barrier id from r0.
    p->AND(src, barrierId, GenRegister::immud(0x8f000000));
    // A barrier message starts the thread synchronization *and* acts as an SLM fence.
    p->BARRIER(src);
    p->curr.execWidth = 1;
    // Now we wait for the other threads.
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->WAIT();
    p->pop();
    if (imageFence) {
      p->FLUSH_SAMPLERCACHE(fenceDst);
      p->MOV(fenceDst, fenceDst);
    }
  }

  void Gen9Context::emitImeInstruction(const SelectionInstruction &insn) {
    const GenRegister dst = ra->genReg(insn.dst(0));
    const unsigned int msg_type = insn.extra.ime_msg_type;

    GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3);
    uint32_t execWidth_org = p->curr.execWidth;
    int virt_pld_len;
    int phi_pld_len = 0;
    int virt_rsp_len;

#define PHI_SIC_PAYLOAD_LEN 8
#define PHI_IME_PAYLOAD_LEN 6
#define PHI_VME_WRITEBACK_LEN 7

    if (msg_type == 1 || msg_type == 2 || msg_type == 3)
      virt_rsp_len = PHI_VME_WRITEBACK_LEN;
    if (msg_type == 1 || msg_type == 3)
      phi_pld_len = PHI_SIC_PAYLOAD_LEN;
    else if (msg_type == 2)
      phi_pld_len = PHI_IME_PAYLOAD_LEN;
    if (execWidth_org == 8)
      virt_pld_len = phi_pld_len;
    else if (execWidth_org == 16)
      virt_pld_len = (phi_pld_len + 1) / 2;

    p->push();
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->curr.noMask = 1;
    p->curr.execWidth = 1;
    /* cl_intel_device_side_avc_motion_estimation is currently implemented only
     * for SIMD16 mode, so falling back to SIMD8 is not acceptable here. */
    GBE_ASSERT(execWidth_org == 16);
    /* Use MOV to set up the payload bits: move the payload values stored in
     * insn.src(x) into consecutive payload GRFs. In SIMD8 mode one virtual GRF
     * maps to one physical GRF, but in SIMD16 mode one virtual GRF maps to two
     * physical GRFs, so the two cases must be handled differently. */
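    /* A worked reading of the payload-length arithmetic above (illustrative
     * only; the values follow from the PHI_* macros): for a SIC message
     * (msg_type 1 or 3) in SIMD16 mode, phi_pld_len = 8, so
     * virt_pld_len = (8 + 1) / 2 = 4 virtual GRFs, each spanning two physical
     * GRFs; for an IME message (msg_type 2), phi_pld_len = 6 gives
     * virt_pld_len = 3. When phi_pld_len is odd, the nr_num = 1 case below
     * trims the last virtual GRF to a single physical GRF. */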
    if (execWidth_org == 8) {
      for (int i = 0; i < virt_pld_len; i++) {
        GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len + i));
        payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
        payload_grf.width = GEN_WIDTH_1;
        payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
        payload_grf.subphysical = 1;
        for (int j = 0; j < 8; j++) {
          payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
          GenRegister payload_val = ra->genReg(insn.src(i * 8 + j));
          payload_val.vstride = GEN_VERTICAL_STRIDE_0;
          payload_val.width = GEN_WIDTH_1;
          payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
          p->MOV(payload_grf, payload_val);
        }
      }
    }
    else if (execWidth_org == 16) {
      for (int i = 0; i < virt_pld_len; i++) {
        int nr_num = 2;
        if ((i == virt_pld_len - 1) && (phi_pld_len % 2 == 1))
          nr_num = 1;
        for (int k = 0; k < nr_num; k++) {
          GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len + i));
          payload_grf.nr += k;
          payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
          payload_grf.width = GEN_WIDTH_1;
          payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
          payload_grf.subphysical = 1;
          for (int j = 0; j < 8; j++) {
            payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
            GenRegister payload_val = ra->genReg(insn.src(i * 16 + k * 8 + j));
            payload_val.vstride = GEN_VERTICAL_STRIDE_0;
            payload_val.width = GEN_WIDTH_1;
            payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
            p->MOV(payload_grf, payload_val);
          }
        }
      }
    }
    p->pop();

#undef PHI_SIC_PAYLOAD_LEN
#undef PHI_IME_PAYLOAD_LEN
#undef PHI_VME_WRITEBACK_LEN

    p->push();
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->curr.noMask = 1;
    p->curr.execWidth = 1;
    GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(virt_rsp_len)), GEN_TYPE_UB);
    payload_did.vstride = GEN_VERTICAL_STRIDE_0;
    payload_did.width = GEN_WIDTH_1;
    payload_did.hstride = GEN_HORIZONTAL_STRIDE_0;
    payload_did.subphysical = 1;
    payload_did.subnr = 20 * typeSize(GEN_TYPE_UB);
    GenRegister grf0 = GenRegister::ub1grf(0, 20);
    p->MOV(payload_did, grf0);
    p->pop();

    const GenRegister msgPayload = ra->genReg(insn.dst(virt_rsp_len));
    const unsigned char bti = insn.getbti();
    p->IME(bti, dst, msgPayload, msg_type);
  }

  void BxtContext::newSelection(void) {
    this->sel = GBE_NEW(SelectionBxt, *this);
  }

  void BxtContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                       GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
  {
    src0.type = src1.type = GEN_TYPE_UD;
    dst_h.type = dst_l.type = GEN_TYPE_UL;
    s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;

    GenRegister s0l = unpacked_ud(src0);
    GenRegister s1l = unpacked_ud(src1);
    GenRegister s0h = unpacked_ud(s0l_s1h); // s0h is only used before s0l_s1h, so reuse s0l_s1h
    GenRegister s1h = unpacked_ud(dst_l);   // s1h is only used before dst_l, so reuse dst_l
    p->MOV(s0h, GenRegister::offset(s0l, 0, 4));
    p->MOV(s1h, GenRegister::offset(s1l, 0, 4));

    /* High 32 bits x high 32 bits. */
    p->MUL(dst_h, s0h, s1h);
    /* High 32 bits x low 32 bits. */
    p->MUL(s0h_s1l, s0h, s1l);
    /* Low 32 bits x high 32 bits. */
    p->MUL(s0l_s1h, s0l, s1h);
    /* Low 32 bits x low 32 bits. */
    p->MUL(dst_l, s0l, s1l);

    /* The max product s0l*s1h is (2^N - 1) * (2^N - 1) = 2^2N + 1 - 2^(N+1), with N = 32.
       The max of adding two 32-bit integers to it is
       2^2N + 1 - 2^(N+1) + 2*(2^N - 1) = 2^2N - 1,
       which means that adding dst_l's high 32 bits and then s0l_s1h's low 32 bits to the
       product s0h_s1l cannot overflow and produces no carry. This way we avoid using the
       acc register, which has a lot of restrictions. */
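    /* For reference, the schoolbook decomposition this routine implements,
     * written out as an identity (no extra code is emitted for it): with
     * a = s0h * 2^32 + s0l and b = s1h * 2^32 + s1l,
     *   a * b = (s0h*s1h) * 2^64 + (s0h*s1l + s0l*s1h) * 2^32 + (s0l*s1l),
     * so dst_h accumulates the 2^64 term plus the carried-up middle bits,
     * while dst_l holds the low 64 bits of the product. */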
    GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h);
    p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l);
    p->SHR(s0l_s1h, s0l_s1h, GenRegister::immud(32));
    GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h);
    p->ADD(dst_h, dst_h, s0l_s1h_h);

    GenRegister dst_l_h = unpacked_ud(s0l_s1h);
    p->MOV(dst_l_h, unpacked_ud(dst_l, 1));
    p->ADD(s0h_s1l, s0h_s1l, dst_l_h);

    // s0l_s1h is no longer needed, so reuse it as a temporary.
    GenRegister tmp = s0l_s1h;
    p->SHL(tmp, s0h_s1l, GenRegister::immud(32));
    GenRegister tmp_unpacked = unpacked_ud(tmp, 1);
    p->MOV(unpacked_ud(dst_l, 1), tmp_unpacked);

    p->SHR(tmp, s0h_s1l, GenRegister::immud(32));
    p->ADD(dst_h, dst_h, tmp);
  }

  void BxtContext::emitI64MULInstruction(const SelectionInstruction &insn)
  {
    GenRegister src0 = ra->genReg(insn.src(0));
    GenRegister src1 = ra->genReg(insn.src(1));
    GenRegister dst = ra->genReg(insn.dst(0));
    GenRegister res = ra->genReg(insn.dst(1));

    src0.type = src1.type = GEN_TYPE_UD;
    dst.type = GEN_TYPE_UL;
    res.type = GEN_TYPE_UL;

    /* Low 32 bits x low 32 bits. */
    GenRegister s0l = unpacked_ud(src0);
    GenRegister s1l = unpacked_ud(src1);
    p->MUL(dst, s0l, s1l);
    /* Low 32 bits x high 32 bits. */
    GenRegister s1h = unpacked_ud(res);
    p->MOV(s1h, unpacked_ud(src1, 1));
    p->MUL(res, s0l, s1h);
    p->SHL(res, res, GenRegister::immud(32));
    p->ADD(dst, dst, res);
    /* High 32 bits x low 32 bits. */
    GenRegister s0h = unpacked_ud(res);
    p->MOV(s0h, unpacked_ud(src0, 1));
    p->MUL(res, s0h, s1l);
    p->SHL(res, res, GenRegister::immud(32));
    p->ADD(dst, dst, res);
  }

  void BxtContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz)
  {
    if (sz == 0)
      sz = 16;
    GBE_ASSERT(sz % 4 == 0);
    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);

    p->push();
    p->curr.execWidth = 1;
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->curr.noMask = 1;
    for (int i = 0; i < sz / 2; i++) {
      p->MOV(GenRegister::retype(GenRegister::addr1(i * 2), GEN_TYPE_UD),
             GenRegister::immud(new_a0[i * 2 + 1] << 16 | new_a0[i * 2]));
    }
    p->pop();
  }

  void BxtContext::emitStackPointer(void) {
    using namespace ir;

    // Only emit the stack pointer computation if we use a stack.
    if (kernel->getStackSize() == 0)
      return;

    // Check that everything is consistent in the kernel code.
    const uint32_t perLaneSize = kernel->getStackSize();
    GBE_ASSERT(perLaneSize > 0);

    const GenRegister selStackPtr = this->simdWidth == 8 ?
      GenRegister::ud8grf(ir::ocl::stackptr) :
      GenRegister::ud16grf(ir::ocl::stackptr);
    const GenRegister stackptr = ra->genReg(selStackPtr);
    // Borrow block ip as a temporary register, as we will
    // initialize block ip later.
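    // A worked example of the address computation below, with illustrative
    // numbers (not taken from the code): with simdWidth = 16 and
    // perLaneSize = 1024 bytes, lane 5 of thread 2 gets
    // stackptr = (2 * 16 + 5) * 1024 = 37888, so every lane owns a disjoint
    // perLaneSize slice of the private stack area.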
    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
    const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);

    loadLaneID(stackptr);

    // We compute the per-lane stack pointer here:
    //   threadId * perThreadSize + laneId * perLaneSize, or equivalently
    //   (threadId * simdWidth + laneId) * perLaneSize.
    // Let the private address start from zero.
    //p->MOV(stackptr, GenRegister::immud(0));
    p->push();
    p->curr.execWidth = 1;
    p->curr.predicate = GEN_PREDICATE_NONE;
    p->AND(tmpReg, GenRegister::ud1grf(0, 5), GenRegister::immuw(0x1ff)); // threadId
    p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); // threadId * simdWidth
    p->curr.execWidth = this->simdWidth;
    p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); // threadId * simdWidth + laneId, must be < 64K
    p->curr.execWidth = 1;
    p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
    p->curr.execWidth = this->simdWidth;
    p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId) * perLaneSize

    if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
      const GenRegister selStackPtr2 = this->simdWidth == 8 ?
        GenRegister::ul8grf(ir::ocl::stackptr) :
        GenRegister::ul16grf(ir::ocl::stackptr);
      GenRegister stackptr2 = ra->genReg(selStackPtr2);
      GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
      int simdWidth = p->curr.execWidth;
      if (simdWidth == 16) {
        // We need to do the second quarter first, because the dst type is QW
        // while the src is DW. If we did the first quarter first, the first
        // quarter's dst would overwrite the second quarter's src.
        p->curr.execWidth = 8;
        p->curr.quarterControl = GEN_COMPRESSION_Q2;
        p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr, 1));
        p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp, 1));
      }
      p->curr.quarterControl = GEN_COMPRESSION_Q1;
      p->MOV(sp, stackptr);
      p->MOV(stackptr2, sp);
    }
    p->pop();
  }

  void KblContext::newSelection(void) {
    this->sel = GBE_NEW(SelectionKbl, *this);
  }

  void GlkContext::newSelection(void) {
    this->sel = GBE_NEW(SelectionGlk, *this);
  }
}