/*
 * Copyright © 2012 Intel Corporation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
 *
 * Author: Benjamin Segovia
 */

/**
 * \file gen_insn_selection.cpp
 * \author Benjamin Segovia
 */

/* This is the instruction selection code. First of all, this is a bunch of C++
 * crap. Sorry if it is not that readable. Anyway, the goal here is to take
 * GenIR code (i.e. the very regular, very RISC IR) and to produce GenISA with
 * virtual registers (i.e. regular GenIR registers).
 *
 * Overall idea:
 * =============
 *
 * There are a lot of papers and research about this, but I tried to keep it
 * simple. No dynamic programming, nothing like that. Just a recursive maximal
 * munch.
 *
 * Basically, the code is executed per basic block from bottom to top. Patterns
 * of GenIR instructions are defined, and each instruction is matched against
 * the best pattern, i.e. the pattern that catches the largest number of
 * instructions. Once matched, a sequence of instructions is output.
 *
 * Each instruction the match depends on is then marked as a "root", i.e. we
 * indicate that each of these instructions must be generated: we indeed need
 * their destinations for the next instructions (remember that we generate the
 * code in reverse order).
 *
 * Patterns:
 * =========
 *
 * There are a lot of patterns and I did not implement all of them, obviously.
 * I just gathered the common code to make implementing a pattern reasonably
 * easy. Adding a pattern is pretty verbose, but it should not be too hard to
 * add new ones.
 *
 * To create and register patterns, I just abused C++ pre-main static
 * initialization. A bunch of patterns is created and sorted per opcode (i.e.
 * the opcode of the root of the pattern): this creates a library of patterns
 * that may be used at run time.
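 *
 * As a quick illustration (with made-up names, simplified from the real
 * pattern classes defined below), a MAD pattern may cover an ADD whose first
 * source is produced by a MUL in the same block:
 *
 *   // dst = a * b + c --> one MAD instead of MUL + ADD
 *   if (dag.insn.getOpcode() == ir::OP_ADD) {
 *     SelectionDAG *mul = dag.child[0];
 *     if (mul != NULL && mul->insn.getOpcode() == ir::OP_MUL) {
 *       sel.MAD(dst, mulSrc0, mulSrc1, addSrc1); // covers two GenIR insns
 *       return true;
 *     }
 *   }
 *
 * When no pattern covers more than one instruction, every source is simply
 * marked as a root and the instructions are emitted one by one.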
 *
 * Predication / Masking and CFG linearization
 * ===========================================
 *
 * The current version is based on an unfortunate choice. Basically, the
 * problem to solve is how to map unstructured branches (i.e. regular gotos)
 * onto Gen. Gen has native support for structured branches
 * (if/else/endif/while...) but nothing really native for unstructured
 * branches.
 *
 * The idea we implemented is simple. We stole one flag register (here f0.0) to
 * mask all the instructions (and only activate the proper SIMD lanes) and we
 * use the CFG linearization technique to properly handle the control flow.
 * This is not really good for one particular reason: Gen instructions must use
 * the *same* flag register for the predicate (used for masking) and the
 * conditional modifier (used as a destination for CMP). This leads to extra
 * complications with compare instructions and select instructions. Basically,
 * we need to insert extra MOVs.
 *
 * Also, there is some extra kludge to handle the predicates for JMPI.
 *
 * TODO:
 * =====
 *
 * Sadly, I recreated here a new DAG class. This is just a bad idea since we
 * already have the DAG per basic block with the Function graph, i.e. the
 * complete graph of uses and definitions. I think we should be able to save a
 * lot of code here if we can simply reuse the code from the UD / DU chains.
 *
 * Finally, cross-block instruction selection is quite possible with this
 * simple approach. Basically, instructions from dominating blocks could be
 * merged and matched with other instructions in the dominated block. This
 * suggests an interesting approach: traverse the dominator tree in post order.
 *
 * We already use if/endif to enclose each basic block. We will continue to
 * identify those blocks which map to structured branching and use purely
 * structured instructions to handle them completely.
 */
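/* A scalar model of the linearization described above: every SIMD lane
 * carries its own block IP, and a block only runs for the lanes whose block
 * IP matches its label; the stolen flag register plays the role of `mask`
 * below. This is an illustrative sketch only (all names are hypothetical)
 * and is kept out of the build on purpose.
 */
#if 0
static void runLinearized(uint16_t blockip[8], int firstLabel, int lastLabel) {
  for (int label = firstLabel; label <= lastLabel; ++label) {
    uint8_t mask = 0;                      // the stolen flag register
    for (int lane = 0; lane < 8; ++lane)   // CMP blockip == label
      if (blockip[lane] == label)
        mask |= uint8_t(1u << lane);
    if (mask == 0)
      continue;                            // JMPI over a fully dead block
    for (int lane = 0; lane < 8; ++lane)
      if (mask & (1u << lane)) {
        // every instruction of the block is predicated on `mask`; a taken
        // branch is just a predicated MOV writing the next blockip[lane]
      }
  }
}
#endif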
#include "backend/gen_insn_selection.hpp"
#include "backend/gen_context.hpp"
#include "ir/function.hpp"
#include "ir/liveness.hpp"
#include "ir/profile.hpp"
#include "sys/cvar.hpp"
#include "sys/vector.hpp"
#include <algorithm>
#include <climits>

namespace gbe
{

  ///////////////////////////////////////////////////////////////////////////
  // Helper functions
  ///////////////////////////////////////////////////////////////////////////

  uint32_t getGenType(ir::Type type) {
    using namespace ir;
    switch (type) {
      case TYPE_BOOL: return GEN_TYPE_W;
      case TYPE_S8: return GEN_TYPE_B;
      case TYPE_U8: return GEN_TYPE_UB;
      case TYPE_S16: return GEN_TYPE_W;
      case TYPE_U16: return GEN_TYPE_UW;
      case TYPE_S32: return GEN_TYPE_D;
      case TYPE_U32: return GEN_TYPE_UD;
      case TYPE_S64: return GEN_TYPE_L;
      case TYPE_U64: return GEN_TYPE_UL;
      case TYPE_FLOAT: return GEN_TYPE_F;
      case TYPE_DOUBLE: return GEN_TYPE_DF;
      case TYPE_HALF: return GEN_TYPE_HF;
      default: NOT_SUPPORTED; return GEN_TYPE_F;
    }
  }

  ir::Type getIRType(uint32_t genType) {
    using namespace ir;
    switch (genType) {
      case GEN_TYPE_B: return TYPE_S8;
      case GEN_TYPE_UB: return TYPE_U8;
      case GEN_TYPE_W: return TYPE_S16;
      case GEN_TYPE_UW: return TYPE_U16;
      case GEN_TYPE_D: return TYPE_S32;
      case GEN_TYPE_UD: return TYPE_U32;
      case GEN_TYPE_L: return TYPE_S64;
      case GEN_TYPE_UL: return TYPE_U64;
      case GEN_TYPE_F: return TYPE_FLOAT;
      case GEN_TYPE_DF: return TYPE_DOUBLE;
      case GEN_TYPE_HF: return TYPE_HALF;
      default: NOT_SUPPORTED; return TYPE_FLOAT;
    }
  }

  uint32_t getGenCompare(ir::Opcode opcode, bool inverse = false) {
    using namespace ir;
    switch (opcode) {
      case OP_LE: return (!inverse) ? GEN_CONDITIONAL_LE : GEN_CONDITIONAL_G;
      case OP_LT: return (!inverse) ? GEN_CONDITIONAL_L : GEN_CONDITIONAL_GE;
      case OP_GE: return (!inverse) ? GEN_CONDITIONAL_GE : GEN_CONDITIONAL_L;
      case OP_GT: return (!inverse) ? GEN_CONDITIONAL_G : GEN_CONDITIONAL_LE;
      case OP_EQ: return (!inverse) ? GEN_CONDITIONAL_EQ : GEN_CONDITIONAL_NEQ;
      case OP_NE: return (!inverse) ? GEN_CONDITIONAL_NEQ : GEN_CONDITIONAL_EQ;
      default: NOT_SUPPORTED; return 0u;
    };
  }

  ///////////////////////////////////////////////////////////////////////////
  // SelectionInstruction
  ///////////////////////////////////////////////////////////////////////////

  SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) :
    parent(NULL), opcode(op), dstNum(dst), srcNum(src)
  {
    extra = { 0 };
  }

  void SelectionInstruction::prepend(SelectionInstruction &other) {
    gbe::prepend(&other, this);
    other.parent = this->parent;
  }

  void SelectionInstruction::append(SelectionInstruction &other) {
    gbe::append(&other, this);
    other.parent = this->parent;
  }

  bool SelectionInstruction::isRead(void) const {
    return this->opcode == SEL_OP_UNTYPED_READ ||
           this->opcode == SEL_OP_UNTYPED_READA64 ||
           this->opcode == SEL_OP_READ64 ||
           this->opcode == SEL_OP_READ64A64 ||
           this->opcode == SEL_OP_ATOMIC ||
           this->opcode == SEL_OP_ATOMICA64 ||
           this->opcode == SEL_OP_BYTE_GATHER ||
           this->opcode == SEL_OP_BYTE_GATHERA64 ||
           this->opcode == SEL_OP_SAMPLE ||
           this->opcode == SEL_OP_VME ||
           this->opcode == SEL_OP_IME ||
           this->opcode == SEL_OP_DWORD_GATHER ||
           this->opcode == SEL_OP_OBREAD ||
           this->opcode == SEL_OP_MBREAD;
  }

  bool SelectionInstruction::modAcc(void) const {
    return this->opcode == SEL_OP_I64SUB ||
           this->opcode == SEL_OP_I64ADD ||
           this->opcode == SEL_OP_MUL_HI ||
           this->opcode == SEL_OP_HADD ||
           this->opcode == SEL_OP_RHADD ||
           this->opcode == SEL_OP_I64MUL ||
           this->opcode == SEL_OP_I64_MUL_HI ||
           this->opcode == SEL_OP_I64MADSAT ||
           this->opcode == SEL_OP_I64DIV ||
           this->opcode == SEL_OP_I64REM ||
           this->opcode == SEL_OP_MACH;
  }

  bool SelectionInstruction::isWrite(void) const {
    return this->opcode == SEL_OP_UNTYPED_WRITE ||
           this->opcode == SEL_OP_UNTYPED_WRITEA64 ||
           this->opcode == SEL_OP_WRITE64 ||
           this->opcode == SEL_OP_WRITE64A64 ||
           this->opcode == SEL_OP_ATOMIC ||
           this->opcode == SEL_OP_ATOMICA64 ||
           this->opcode == SEL_OP_BYTE_SCATTER ||
           this->opcode == SEL_OP_BYTE_SCATTERA64 ||
           this->opcode == SEL_OP_TYPED_WRITE ||
           this->opcode == SEL_OP_OBWRITE ||
           this->opcode == SEL_OP_MBWRITE;
  }

  bool SelectionInstruction::isBranch(void) const {
    return this->opcode == SEL_OP_JMPI;
  }

  bool SelectionInstruction::isLabel(void) const {
    return this->opcode == SEL_OP_LABEL;
  }

  bool SelectionInstruction::sameAsDstRegion(uint32_t srcID) {
    assert(srcID < srcNum);
    if (dstNum == 0)
      return true;
    GenRegister &srcReg = this->src(srcID);
    for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
      const GenRegister &dstReg = this->dst(dstID);
      if (!dstReg.isSameRegion(srcReg))
        return false;
    }
    return true;
  }

  bool SelectionInstruction::isNative(void) const {
    return this->opcode == SEL_OP_NOT     || /* ALU1 */
           this->opcode == SEL_OP_LZD     ||
           this->opcode == SEL_OP_RNDZ    ||
           this->opcode == SEL_OP_RNDE    ||
           this->opcode == SEL_OP_RNDD    ||
           this->opcode == SEL_OP_RNDU    ||
           this->opcode == SEL_OP_FRC     ||
           this->opcode == SEL_OP_F16TO32 ||
           this->opcode == SEL_OP_F32TO16 ||
           this->opcode == SEL_OP_CBIT    ||
           this->opcode == SEL_OP_SEL     || /* ALU2 */
           this->opcode == SEL_OP_AND     ||
           this->opcode == SEL_OP_OR      ||
           this->opcode == SEL_OP_XOR     ||
           this->opcode == SEL_OP_SHR     ||
           this->opcode == SEL_OP_SHL     ||
           this->opcode == SEL_OP_RSR     ||
           this->opcode == SEL_OP_RSL     ||
           this->opcode == SEL_OP_ASR     ||
           this->opcode == SEL_OP_SEL     ||
           this->opcode == SEL_OP_ADD     ||
           this->opcode == SEL_OP_MUL     ||
           this->opcode == SEL_OP_FBH     ||
           this->opcode == SEL_OP_FBL     ||
           this->opcode == SEL_OP_MACH    ||
           this->opcode == SEL_OP_MATH    ||
           this->opcode == SEL_OP_LRP     || /* ALU3 */
           this->opcode == SEL_OP_MAD;
  }
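  /* A toy picture of the DAG the next sections build: for a block containing
   *   t = b * c;
   *   d = t + a;
   * the node of the ADD has the node of the MUL as child 0 (through register
   * t), and child 1 is NULL if `a` is defined outside the block. Matching
   * walks the block bottom-up; whatever a pattern does not consume is marked
   * as a root so its code still gets emitted. Illustrative sketch only
   * (hypothetical names, kept out of the build):
   */
#if 0
  struct ToyDAG {
    int opcode;       // IR opcode of the instruction
    ToyDAG *child[2]; // producers of the sources (NULL if defined elsewhere)
    bool isRoot;      // must be emitted even if no pattern merged it
  };
#endif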
  ///////////////////////////////////////////////////////////////////////////
  // SelectionVector
  ///////////////////////////////////////////////////////////////////////////

  SelectionVector::SelectionVector(void) :
    insn(NULL), reg(NULL), regNum(0), isSrc(0)
  {}

  ///////////////////////////////////////////////////////////////////////////
  // SelectionBlock
  ///////////////////////////////////////////////////////////////////////////

  SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) :
    bb(bb), endifLabel((ir::LabelIndex)0)
  {}

  void SelectionBlock::append(ir::Register reg) { tmp.push_back(reg); }

  void SelectionBlock::append(SelectionInstruction *insn) {
    this->insnList.push_back(insn);
    insn->parent = this;
  }

  void SelectionBlock::prepend(SelectionInstruction *insn) {
    this->insnList.push_front(insn);
    insn->parent = this;
  }

  void SelectionBlock::append(SelectionVector *vec) {
    this->vectorList.push_back(vec);
  }

#define LD_MSG_ORDER_IVB 7
#define LD_MSG_ORDER_SKL 9

  ///////////////////////////////////////////////////////////////////////////
  // Maximal munch selection on DAG
  ///////////////////////////////////////////////////////////////////////////

  /*! All instructions in a block are organized into a DAG */
  class SelectionDAG
  {
  public:
    INLINE SelectionDAG(const ir::Instruction &insn) :
      insn(insn), mergeable(0), childNum(insn.getSrcNum()), isRoot(0)
    {
      GBE_ASSERT(insn.getSrcNum() <= ir::Instruction::MAX_SRC_NUM);
      for (uint32_t childID = 0; childID < childNum; ++childID)
        this->child[childID] = NULL;
      computeBool = false;
      isUsed = false;
    }
    /*! Mergeable are non-root instructions with valid sources */
    INLINE void setAsMergeable(uint32_t which) { mergeable |= (1 << which); }
    /*! Check whether the given source may be merged */
    INLINE uint32_t isMergeable(uint32_t which) const { return mergeable & (1 << which); }
    /*! All the sources of the instruction */
    SelectionDAG *child[ir::Instruction::MAX_SRC_NUM];
    /*! Instruction that needs to be matched */
    const ir::Instruction &insn;
    /*! When a source has been overwritten, the child instruction cannot be merged */
    uint32_t mergeable:ir::Instruction::MAX_SRC_NUM;
    /*! Number of children we have in the pattern */
    uint32_t childNum:7;
    /*! A root must be generated, no matter what */
    uint32_t isRoot:1;
    /*! A bool register is used as a normal computing source */
    bool computeBool;
    /*! The instruction is used in this block */
    bool isUsed;
  };

  /*! A pattern matches a tree of GenIR instructions and emits the
   *  corresponding selection instructions
   */
  class SelectionPattern
  {
  public:
    SelectionPattern(uint32_t insnNum, uint32_t cost) :
      insnNum(insnNum), cost(cost) {}
    /*! This is an abstract class */
    virtual ~SelectionPattern(void) {}
    /*! Emit the selection instructions for the matched DAG (return false if no match) */
    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const = 0;
    /*! All the possible opcodes for this pattern (for fast sort) */
    vector<ir::Opcode> opcodes;
    /*! Number of instruction generated */
    uint32_t insnNum;
    /*! Cost of the pattern */
    uint32_t cost;
  };

  /*! Store and sort all the patterns. This is our global library we use for
   *  the code selection
   */
  class SelectionLibrary
  {
  public:
    /*! Will register all the patterns */
    SelectionLibrary(void);
    /*! Release and destroy all the registered patterns */
    ~SelectionLibrary(void);
    /*! Insert the given pattern for all associated opcodes */
    template <typename PatternType> void insert(void);
    /*! One list of patterns per opcode */
    typedef vector<const SelectionPattern*> PatternList;
    /*! All lists of patterns properly sorted per opcode */
    PatternList patterns[ir::OP_INVALID];
    /*! All patterns to free */
    vector<const SelectionPattern*> toFree;
  };

  ///////////////////////////////////////////////////////////////////////////
  // Code selection internal implementation
  ///////////////////////////////////////////////////////////////////////////

  /*! Actual implementation of the instruction selection engine */
  class Selection::Opaque
  {
  public:
    /*! simdWidth is the default width for the instructions */
    Opaque(GenContext &ctx);
    /*! Release everything */
    virtual ~Opaque(void);
    /*! Implements the instruction selection itself */
    void select(void);
    /*! Start a backward generation (from the end of the block) */
    void startBackwardGeneration(void);
    /*! End backward code generation and output the code in the block */
    void endBackwardGeneration(void);
    /*! Implement public class */
    uint32_t getLargestBlockSize(void) const;
    /*! Implement public class */
    INLINE uint32_t getVectorNum(void) const { return this->vectorNum; }
    /*! Implement public class */
    INLINE ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov);
    /*! Implement public class */
    INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov);
    /*!
spill a register (insert spill/unspill instructions) */ INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool); bool has32X32Mul() const { return bHas32X32Mul; } bool hasSends() const { return bHasSends; } void setHas32X32Mul(bool b) { bHas32X32Mul = b; } void setHasSends(bool b) { bHasSends = b; } bool hasLongType() const { return bHasLongType; } bool hasDoubleType() const { return bHasDoubleType; } bool hasHalfType() const { return bHasHalfType; } void setHasLongType(bool b) { bHasLongType = b; } void setHasDoubleType(bool b) { bHasDoubleType = b; } void setHasHalfType(bool b) { bHasHalfType = b; } bool hasLongRegRestrict() { return bLongRegRestrict; } void setLongRegRestrict(bool b) { bLongRegRestrict = b; } void setLdMsgOrder(uint32_t type) { ldMsgOrder = type; } uint32_t getLdMsgOrder() const { return ldMsgOrder; } void setSlowByteGather(bool b) { slowByteGather = b; } bool getSlowByteGather() { return slowByteGather; } /*! indicate whether a register is a scalar/uniform register. */ INLINE bool isPartialWrite(const ir::Register ®) const { return partialWriteRegs.find(reg.value()) != partialWriteRegs.end(); } INLINE bool isScalarReg(const ir::Register ®) const { const ir::RegisterData ®Data = getRegisterData(reg); return regData.isUniform(); } INLINE bool isLongReg(const ir::Register ®) const { const ir::RegisterData ®Data = getRegisterData(reg); return regData.family == ir::FAMILY_QWORD; } INLINE GenRegister unpacked_ud(const ir::Register ®) const { return GenRegister::unpacked_ud(reg, isScalarReg(reg)); } INLINE GenRegister unpacked_uw(const ir::Register ®) const { return GenRegister::unpacked_uw(reg, isScalarReg(reg), isLongReg(reg)); } INLINE GenRegister unpacked_ub(const ir::Register ®) const { return GenRegister::unpacked_ub(reg, isScalarReg(reg)); } INLINE GenRegister getOffsetReg(GenRegister reg, int nr, int subnr, bool isDst = true) { if (isDst) partialWriteRegs.insert(reg.value.reg); return GenRegister::offset(reg, nr, subnr); } GenRegister getLaneIDReg(); /*! Implement public class */ INLINE uint32_t getRegNum(void) const { return file.regNum(); } /*! Implements public interface */ INLINE ir::RegisterData getRegisterData(ir::Register reg) const { return file.get(reg); } /*! Implement public class */ INLINE ir::RegisterFamily getRegisterFamily(ir::Register reg) const { return file.get(reg).family; } /*! Implement public class */ SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum); /*! Return the selection register from the GenIR one */ GenRegister selReg(ir::Register, ir::Type type = ir::TYPE_FLOAT) const; /*! Compute the nth register part when using SIMD8 with Qn (n in 2,3,4) */ GenRegister selRegQn(ir::Register, uint32_t quarter, ir::Type type = ir::TYPE_FLOAT) const; /*! Size of the stack (should be large enough) */ enum { MAX_STATE_NUM = 16 }; /*! Push the current instruction state */ INLINE void push(void) { assert(stateNum < MAX_STATE_NUM); stack[stateNum++] = curr; } /*! Pop the latest pushed state */ INLINE void pop(void) { assert(stateNum > 0); curr = stack[--stateNum]; } /*! Create a new register in the register file and append it in the * temporary list of the current block */ INLINE ir::Register reg(ir::RegisterFamily family, bool scalar = false) { GBE_ASSERT(block != NULL); const ir::Register reg = file.append(family, scalar); block->append(reg); return reg; } /*! Append a block at the block stream tail. It becomes the current block */ void appendBlock(const ir::BasicBlock &bb); /*! 
Append an instruction in the current block */
    SelectionInstruction *appendInsn(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
    /*! Append a new vector of registers in the current block */
    SelectionVector *appendVector(void);
    /*! Build a DAG for the basic block (return number of instructions) */
    uint32_t buildBasicBlockDAG(const ir::BasicBlock &bb);
    /*! Perform the selection on the basic block */
    void matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum);
    /*! An instruction has a QWORD family src or dst operand. */
    bool hasQWord(const ir::Instruction &insn);
    /*! A root instruction needs to be generated */
    bool isRoot(const ir::Instruction &insn) const;
    /*! Set debug information to Selection */
    void setDBGInfo_SEL(DebugInfo in) { DBGInfo = in; }
    /*! To handle selection block allocation */
    DECL_POOL(SelectionBlock, blockPool);
    /*! To handle selection instruction allocation */
    LinearAllocator insnAllocator;
    /*! To handle selection vector allocation */
    DECL_POOL(SelectionVector, vecPool);
    /*! Per register information used with top-down block sweeping */
    vector<SelectionDAG*> regDAG;
    /*! Store one DAG per instruction */
    vector<SelectionDAG*> insnDAG;
    /*! Owns this structure */
    GenContext &ctx;
    /*! Tail of the code fragment for backward code generation */
    intrusive_list<SelectionInstruction> bwdList;
    /*! List of emitted blocks */
    intrusive_list<SelectionBlock> blockList;
    /*! Currently processed block */
    SelectionBlock *block;
    /*! Current instruction state to use */
    GenInstructionState curr;
    /*! We append new registers so we duplicate the function register file */
    ir::RegisterFile file;
    /*! State used to encode the instructions */
    GenInstructionState stack[MAX_STATE_NUM];
    /*! Maximum number of instructions in the basic blocks */
    uint32_t maxInsnNum;
    /*! Speed up instruction dag allocation */
    DECL_POOL(SelectionDAG, dagPool);
    /*! Total number of registers in the function we encode */
    uint32_t regNum;
    /*! Number of states currently pushed */
    uint32_t stateNum;
    /*! Number of vectors allocated */
    uint32_t vectorNum;
    /*! If true, generate code backward */
    bool bwdCodeGeneration;
    DebugInfo DBGInfo;
    /*! To make function prototypes more readable */
    typedef const GenRegister &Reg;
    /*! If true, the thread map has already been stored */
    bool storeThreadMap;
    /*! Track destination registers. The main purpose is to find partially
     *  updated dst registers; these registers will be unspillable.
     */
    set<uint32_t> partialWriteRegs;
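    /* The ALU* helper macros below stamp out one thin wrapper per selection
     * opcode. For example, ALU2(ADD) expands to:
     *
     *   INLINE void ADD(Reg dst, Reg src0, Reg src1)
     *   { ALU2(SEL_OP_ADD, dst, src0, src1); }
     *
     * i.e. each generated method simply forwards to the generic ALU2()
     * emitter declared further below with the matching SEL_OP_* opcode.
     */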
#define ALU1(OP) \
  INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }
#define ALU1WithTemp(OP) \
  INLINE void OP(Reg dst, Reg src, Reg temp) { ALU1WithTemp(SEL_OP_##OP, dst, src, temp); }
#define ALU2(OP) \
  INLINE void OP(Reg dst, Reg src0, Reg src1) { ALU2(SEL_OP_##OP, dst, src0, src1); }
#define ALU2WithTemp(OP) \
  INLINE void OP(Reg dst, Reg src0, Reg src1, Reg temp) { ALU2WithTemp(SEL_OP_##OP, dst, src0, src1, temp); }
#define ALU3(OP) \
  INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
#define I64Shift(OP) \
  INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
    ALU1(MOV)
    ALU1(READ_ARF)
    ALU1(LOAD_INT64_IMM)
    ALU1(RNDZ)
    ALU1(RNDE)
    ALU1(F16TO32)
    ALU1(F32TO16)
    ALU1WithTemp(BSWAP)
    ALU2(SEL)
    ALU2(SEL_INT64)
    ALU1(NOT)
    ALU2(AND)
    ALU2(OR)
    ALU2(XOR)
    ALU2(I64AND)
    ALU2(I64OR)
    ALU2(I64XOR)
    ALU2(SHR)
    ALU2(SHL)
    ALU2(RSR)
    ALU2(RSL)
    ALU2(ASR)
    ALU2(ADD)
    ALU2WithTemp(I64ADD)
    ALU2WithTemp(I64SUB)
    ALU2(MUL)
    ALU1(FRC)
    ALU1(RNDD)
    ALU1(RNDU)
    ALU2(MACH)
    ALU1(LZD)
    ALU3(MAD)
    ALU3(LRP)
    ALU2WithTemp(MUL_HI)
    ALU1(FBH)
    ALU1(FBL)
    ALU1(CBIT)
    ALU2WithTemp(HADD)
    ALU2WithTemp(RHADD)
    ALU2(UPSAMPLE_LONG)
    ALU1WithTemp(CONVI_TO_I64)
    ALU1WithTemp(CONVF_TO_I64)
    ALU1(CONVI64_TO_I)
    I64Shift(I64SHL)
    I64Shift(I64SHR)
    I64Shift(I64ASR)
    ALU1(BFREV)
#undef ALU1
#undef ALU1WithTemp
#undef ALU2
#undef ALU2WithTemp
#undef ALU3
#undef I64Shift
    /*! SIMD shuffle */
    void SIMD_SHUFFLE(Reg dst, Reg src0, Reg src1);
    /*! Convert 64-bit integer to 32-bit float */
    void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[6]);
    /*! Convert 32-bit float to 64-bit integer */
    void CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[2]);
    /*! Saturated 64bit x*y + z */
    void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister *tmp, int tmp_num);
    /*! High 64bit of x*y */
    void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_num);
    /*! (x+y)>>1 without mod. overflow */
    void I64HADD(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_num);
    /*! (x+y+1)>>1 without mod. overflow */
    void I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_num);
    /*! Shift a 64-bit integer */
    void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]);
    /*! Compare 64-bit integers */
    void I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]);
    /*! Saturated addition of 64-bit integers */
    void I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]);
    /*! Saturated subtraction of 64-bit integers */
    void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]);
    /*! Encode a barrier instruction */
    void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType);
    /*! Encode a memory fence instruction */
    void FENCE(GenRegister dst);
    /*! Encode a label instruction */
    void LABEL(ir::LabelIndex label);
    /*! Jump indexed instruction, return the encoded instruction count according to jump distance. */
    int JMPI(Reg src, ir::LabelIndex target, ir::LabelIndex origin);
    /*! IF indexed instruction */
    void IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip);
    /*! ELSE indexed instruction */
    void ELSE(Reg src, ir::LabelIndex jip, ir::LabelIndex elseLabel);
    /*! ENDIF indexed instruction */
    void ENDIF(Reg src, ir::LabelIndex jip, ir::LabelIndex endifLabel = ir::LabelIndex(0));
    /*! WHILE indexed instruction */
    void WHILE(Reg src, ir::LabelIndex jip);
    /*! BRD indexed instruction */
    void BRD(Reg src, ir::LabelIndex jip);
    /*! BRC indexed instruction */
    void BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip);
    /*! Compare instructions */
    void CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst = GenRegister::null());
    /*! Select instruction with embedded comparison */
    void SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1);
    /*! Constant buffer move instruction */
    void INDIRECT_MOVE(Reg dst, Reg tmp, Reg base, Reg regOffset, uint32_t immOffset);
    /*! EOT is used to finish GPGPU threads */
    void EOT(void);
    /*! No-op */
    void NOP(void);
    /*! Wait instruction (used for the barrier) */
    void WAIT(uint32_t n = 0);
    /*! Atomic instruction */
    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, GenRegister bti, vector<GenRegister> temps);
    /*! AtomicA64 instruction */
    void ATOMICA64(Reg dst, uint32_t function, uint32_t srcNum, vector<GenRegister> src, GenRegister bti, vector<GenRegister> temps);
    /*! Read 64-bit float/int array */
    void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, const GenRegister bti, bool native_long, vector<GenRegister> temps);
    /*! Write 64-bit float/int array */
    void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, GenRegister bti, bool native_long, vector<GenRegister> temps);
    /*! Read64 A64 */
    void READ64A64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum);
    /*! Write64 A64 */
    void WRITE64A64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum);
    /*! Untyped read (up to 4 elements) */
    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, GenRegister bti, vector<GenRegister> temps);
    /*! Untyped write (up to 4 elements) */
    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, GenRegister bti, vector<GenRegister> temps);
    /*! Byte gather (for unaligned bytes, shorts and ints) */
    void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, vector<GenRegister> temps);
    /*! Byte scatter (for unaligned bytes, shorts and ints) */
    void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, vector<GenRegister> temps);
    /*! Byte gather a64 (for unaligned bytes, shorts and ints) */
    void BYTE_GATHERA64(Reg dst, Reg addr, uint32_t elemSize);
    /*! Byte scatter a64 (for unaligned bytes, shorts and ints) */
    void BYTE_SCATTERA64(GenRegister *msg, unsigned msgNum, uint32_t elemSize);
    /*! Untyped read a64 (up to 4 elements) */
    void UNTYPED_READA64(Reg addr, const GenRegister *dst, uint32_t dstNum, uint32_t elemNum);
    /*! Untyped write a64 (up to 4 elements) */
    void UNTYPED_WRITEA64(const GenRegister *msgs, uint32_t msgNum, uint32_t elemNum);
    /*! DWord scatter (for constant cache read) */
    void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
    /*! Unpack the uint to charN */
    void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum);
    /*! Pack the charN to uint */
    void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum);
    /*! Unpack the long to a uint pair */
    void UNPACK_LONG(const GenRegister dst, const GenRegister src);
    /*! Pack a uint pair to a long */
    void PACK_LONG(const GenRegister dst, const GenRegister src);
    /*! Extended math function (2 arguments) */
    void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
    /*! Extended math function (1 argument) */
    void MATH(Reg dst, uint32_t function, Reg src);
    /*! Encode unary instructions */
    void ALU1(SelectionOpcode opcode, Reg dst, Reg src);
    /*! Encode unary with temp reg instructions */
    void ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg temp);
    /*! Encode binary instructions */
    void ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1);
    /*! Encode binary with temp reg instructions */
    void ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp);
    /*! Encode ternary instructions */
    void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
    /*! Encode sample instructions */
    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform);
    /*! Encode vme instructions */
    void VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type, uint32_t vme_search_path_lut, uint32_t lut_sub);
    void IME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type);
    /*! Encode typed write instructions */
    void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
    /*! Get image information */
    void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
    /*! Calculate the timestamp */
    void CALC_TIMESTAMP(GenRegister ts[5], int tsN, GenRegister tmp, uint32_t pointNum, uint32_t tsType);
    /*! Store the profiling info */
    void STORE_PROFILING(uint32_t profilingType, uint32_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister ts[5], int tsNum);
    /*! Printf */
    void PRINTF(uint8_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister src[8], int srcNum, uint16_t num, bool isContinue, uint32_t totalSize);
    /*! Multiply 64-bit integers */
    void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool native_long);
    /*! 64-bit integer division */
    void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_int);
    /*! 64-bit integer remainder of division */
    void I64REM(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_int);
    /*! Double division */
    void F64DIV(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmpNum);
    /*! Work Group Operations */
    void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister tmpData1, GenRegister localThreadID, GenRegister localThreadNUM, GenRegister tmpData2, GenRegister slmOff, vector<GenRegister> msg, GenRegister localBarrier);
    /*! Sub Group Operations */
    void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister tmpData1, GenRegister tmpData2);
    /*! Oblock read */
    void OBREAD(GenRegister *dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t ow_size);
    /*! Oblock write */
    void OBWRITE(GenRegister header, GenRegister *values, uint32_t tmp_size, uint32_t bti, uint32_t ow_size);
    /*! Media block read */
    void MBREAD(GenRegister *dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t response_size);
    /*! Media block write */
    void MBWRITE(GenRegister header, GenRegister *values, uint32_t tmp_size, uint32_t bti, uint32_t data_size);

    /* Common functions for binary instructions, sel_cmp and compare
     * instructions. They handle the IMM or normal register assignment and try
     * to avoid LOADI as much as possible. */
    void getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0, GenRegister &src1, ir::Type type, bool &inverse);
    void getSrcGenRegImm(SelectionDAG &dag, SelectionDAG *dag0, SelectionDAG *dag1, GenRegister &src0, GenRegister &src1, ir::Type type, bool &inverse);

    /* Get the current block IP register according to the label width. */
    GenRegister getBlockIP() {
      return ctx.isDWLabel() ? selReg(ir::ocl::dwblockip) : selReg(ir::ocl::blockip);
    }
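    /* Typical use of the block IP helpers around here under CFG
     * linearization (illustrative sequence; labelValue/nextLabelValue are
     * hypothetical):
     *
     *   GenRegister ip = getBlockIP();
     *   cmpBlockIP(GEN_CONDITIONAL_EQ, ip, labelValue); // set the flag per lane
     *   // ... block body, predicated on that flag ...
     *   setBlockIP(ip, nextLabelValue);                 // active lanes move on
     */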
    /* Get the proper label immediate gen register from a label value. */
    GenRegister getLabelImmReg(uint32_t labelValue) {
      return ctx.isDWLabel() ? GenRegister::immud(labelValue) : GenRegister::immuw(labelValue);
    }
    /* Get the proper label immediate gen register from a label. */
    GenRegister getLabelImmReg(ir::LabelIndex label) {
      return getLabelImmReg(label.value());
    }
    /* Set the current label register to a label value. */
    void setBlockIP(GenRegister blockip, uint32_t labelValue) {
      if (!ctx.isDWLabel())
        MOV(GenRegister::retype(blockip, GEN_TYPE_UW), GenRegister::immuw(labelValue));
      else
        MOV(GenRegister::retype(blockip, GEN_TYPE_UD), GenRegister::immud(labelValue));
    }
    /* Generate a comparison instruction to compare the block ip address with
     * the specified label register. */
    void cmpBlockIP(uint32_t cond, GenRegister blockip, GenRegister labelReg) {
      if (!ctx.isDWLabel())
        CMP(cond,
            GenRegister::retype(blockip, GEN_TYPE_UW),
            labelReg,
            GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
      else
        CMP(cond,
            GenRegister::retype(blockip, GEN_TYPE_UD),
            labelReg,
            GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
    }
    void cmpBlockIP(uint32_t cond, GenRegister blockip, uint32_t labelValue) {
      if (!ctx.isDWLabel())
        CMP(cond,
            GenRegister::retype(blockip, GEN_TYPE_UW),
            GenRegister::immuw(labelValue),
            GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
      else
        CMP(cond,
            GenRegister::retype(blockip, GEN_TYPE_UD),
            GenRegister::immud(labelValue),
            GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
    }

    INLINE vector<GenRegister> getBTITemps(const ir::AddressMode &AM) {
      vector<GenRegister> temps;
      if (AM == ir::AM_DynamicBti) {
        temps.push_back(selReg(reg(ir::FAMILY_WORD, true), ir::TYPE_U16));
        temps.push_back(selReg(reg(ir::FAMILY_DWORD, true), ir::TYPE_U32));
      }
      return temps;
    }

    /*! Use custom allocators */
    GBE_CLASS(Opaque);
    friend class SelectionBlock;
    friend class SelectionInstruction;

  private:
    /*! Auxiliary label for if/endif. */
    uint32_t currAuxLabel;
    bool bHas32X32Mul;
    bool bHasLongType;
    bool bHasDoubleType;
    bool bHasHalfType;
    bool bLongRegRestrict;
    bool bHasSends;
    uint32_t ldMsgOrder;
    bool slowByteGather;
    INLINE ir::LabelIndex newAuxLabel() {
      currAuxLabel++;
      return (ir::LabelIndex)currAuxLabel;
    }
  };

  ///////////////////////////////////////////////////////////////////////////
  // Helper function
  ///////////////////////////////////////////////////////////////////////////

  /*! Directly mark all sources as root (when no match is found) */
  static void markAllChildren(SelectionDAG &dag) {
    // Do not merge anything, so all sources become roots
    for (uint32_t childID = 0; childID < dag.childNum; ++childID)
      if (dag.child[childID])
        dag.child[childID]->isRoot = 1;
  }
  /*! Helper function to figure out if two sources are the same */
  static bool sourceMatch(SelectionDAG *src0DAG, uint32_t src0ID,
                          SelectionDAG *src1DAG, uint32_t src1ID)
  {
    GBE_ASSERT(src0DAG && src1DAG);
    // Ensure they are the same physical registers
    const ir::Register src0 = src0DAG->insn.getSrc(src0ID);
    const ir::Register src1 = src1DAG->insn.getSrc(src1ID);
    if (src0 != src1)
      return false;
    // Ensure they contain the same values
    return src0DAG->child[src0ID] == src1DAG->child[src1ID];
  }

  Selection::Opaque::Opaque(GenContext &ctx) :
    ctx(ctx), block(NULL),
    curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
    maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
    stateNum(0), vectorNum(0), bwdCodeGeneration(false), storeThreadMap(false),
    currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false),
    bHasLongType(false), bHasDoubleType(false), bHasHalfType(false),
    bLongRegRestrict(false), bHasSends(false), ldMsgOrder(LD_MSG_ORDER_IVB),
    slowByteGather(false)
  {
    const ir::Function &fn = ctx.getFunction();
    this->regNum = fn.regNum();
    this->regDAG.resize(regNum);
    this->insnDAG.resize(maxInsnNum);
  }

  Selection::Opaque::~Opaque(void) {
    for (auto it = blockList.begin(); it != blockList.end();) {
      SelectionBlock &block = *it;
      ++it;
      this->deleteSelectionBlock(&block);
    }
  }

  SelectionInstruction*
  Selection::Opaque::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum)
  {
    const size_t regSize = (dstNum + srcNum) * sizeof(GenRegister);
    const size_t size = sizeof(SelectionInstruction) + regSize;
    void *ptr = insnAllocator.allocate(size);
    return new (ptr) SelectionInstruction(opcode, dstNum, srcNum);
  }

  void Selection::Opaque::startBackwardGeneration(void) {
    this->bwdCodeGeneration = true;
  }

  void Selection::Opaque::endBackwardGeneration(void) {
    for (auto it = bwdList.rbegin(); it != bwdList.rend();) {
      SelectionInstruction &insn = *it;
      auto toRemoveIt = it--;
      bwdList.erase(toRemoveIt);
      this->block->prepend(&insn);
    }
    this->bwdCodeGeneration = false;
  }

  uint32_t Selection::Opaque::getLargestBlockSize(void) const {
    size_t maxInsnNum = 0;
    for (const auto &bb : blockList)
      maxInsnNum = std::max(maxInsnNum, bb.insnList.size());
    return uint32_t(maxInsnNum);
  }

  void Selection::Opaque::appendBlock(const ir::BasicBlock &bb) {
    this->block = this->newSelectionBlock(&bb);
    this->blockList.push_back(this->block);
  }

  SelectionInstruction *Selection::Opaque::appendInsn(SelectionOpcode opcode,
                                                      uint32_t dstNum, uint32_t srcNum)
  {
    GBE_ASSERT(dstNum <= SelectionInstruction::MAX_DST_NUM && srcNum <= SelectionInstruction::MAX_SRC_NUM);
    GBE_ASSERT(this->block != NULL);
    SelectionInstruction *insn = this->create(opcode, dstNum, srcNum);
    insn->setDBGInfo(DBGInfo);
    if (this->bwdCodeGeneration)
      this->bwdList.push_back(insn);
    else
      this->block->append(insn);
    insn->state = this->curr;
    return insn;
  }

  SelectionVector *Selection::Opaque::appendVector(void) {
    GBE_ASSERT(this->block != NULL);
    SelectionVector *vector = this->newSelectionVector();
    if (this->bwdCodeGeneration)
      vector->insn = this->bwdList.back();
    else
      vector->insn = this->block->insnList.back();
    this->block->append(vector);
    this->vectorNum++;
    return vector;
  }

  bool Selection::Opaque::spillRegs(const SpilledRegs &spilledRegs,
                                    uint32_t registerPool)
  {
    GBE_ASSERT(registerPool != 0);

    struct SpillReg {
      uint32_t type:4;    //!< Gen type
      uint32_t vstride:4; //!< Vertical stride
      uint32_t width:3;   //!< Width
      uint32_t hstride:2; //!< Horizontal stride
    };
    map<uint32_t, struct SpillReg> SpillRegs;

    for (auto &block : blockList)
      for (auto &insn : block.insnList) {
        // Spill/unspill instructions themselves must be skipped when spilling
        if (insn.opcode == SEL_OP_SPILL_REG || insn.opcode == SEL_OP_UNSPILL_REG)
          continue;

        const int simdWidth = insn.state.execWidth;
        const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;

        struct RegSlot {
          RegSlot(ir::Register _reg, uint8_t _srcID, uint8_t _poolOffset, bool _isTmp, uint32_t _addr) :
            reg(_reg), srcID(_srcID), poolOffset(_poolOffset), isTmpReg(_isTmp), addr(_addr) {};
          ir::Register reg;
          union {
            uint8_t srcID;
            uint8_t dstID;
          };
          uint8_t poolOffset;
          bool isTmpReg;
          int32_t addr;
        };

        uint8_t poolOffset = 1; // keep one for the scratch message header
        vector<RegSlot> regSet;
        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
          const GenRegister selReg = insn.src(srcID);
          const ir::Register reg = selReg.reg();
          auto it = spilledRegs.find(reg);
          if (it != spilledRegs.end() && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
            ir::RegisterFamily family = getRegisterFamily(reg);
            if (family == ir::FAMILY_QWORD && poolOffset == 1) {
              // a qword register fill cannot share the scratch read message payload register
              poolOffset += simdWidth / 8;
            }
            struct RegSlot regSlot(reg, srcID, poolOffset, it->second.isTmpReg, it->second.addr);
            if (family == ir::FAMILY_QWORD)
              poolOffset += 2 * simdWidth / 8;
            else
              poolOffset += simdWidth / 8;
            regSet.push_back(regSlot);
          }
        }

        if (poolOffset > ctx.reservedSpillRegs)
          return false;
        // FIXME, to support post register allocation scheduling,
        // put all the reserved registers in the spill/unspill destination registers.
        // This is not the best way. We need to refine the spill/unspill instructions to
        // only use passed-in registers and not access hard-coded offsets in the future.
        while (!regSet.empty()) {
          struct RegSlot regSlot = regSet.back();
          regSet.pop_back();
          const GenRegister selReg = insn.src(regSlot.srcID);
          if (!regSlot.isTmpReg) {
            /* For temporary registers, we don't need to unspill. */
            SelectionInstruction *unspill =
              this->create(SEL_OP_UNSPILL_REG,
                           1 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(), 0);
            unspill->state = GenInstructionState(simdWidth);
            unspill->state.noMask = 1;
            auto it = SpillRegs.find(selReg.value.reg);
            GenRegister dst0;
            if (it != SpillRegs.end()) {
              dst0 = GenRegister(GEN_GENERAL_REGISTER_FILE,
                                 registerPool + regSlot.poolOffset, 0,
                                 it->second.type, it->second.vstride,
                                 it->second.width, it->second.hstride);
            } else {
              dst0 = GenRegister(GEN_GENERAL_REGISTER_FILE,
                                 registerPool + regSlot.poolOffset, 0,
                                 selReg.type, selReg.vstride,
                                 selReg.width, selReg.hstride);
            }
            dst0.value.reg = selReg.value.reg;
            unspill->dst(0) = dst0;
            for (uint32_t i = 1; i < 1 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(); i++)
              unspill->dst(i) = ctx.getSimdWidth() == 8 ?
                GenRegister::vec8(GEN_GENERAL_REGISTER_FILE, registerPool + (i - 1), 0) :
                GenRegister::vec16(GEN_GENERAL_REGISTER_FILE, registerPool + (i - 1) * 2, 0);
            unspill->extra.scratchOffset = regSlot.addr + selReg.quarter * 4 * simdWidth;
            unspill->extra.scratchMsgHeader = registerPool;
            insn.prepend(*unspill);
          }

          GenRegister src = insn.src(regSlot.srcID);
          // change nr/subnr, keep other register settings
          src.nr = registerPool + regSlot.poolOffset + src.nr;
          src.physical = 1;
          insn.src(regSlot.srcID) = src;
        };

        /* To save one register, registerPool + 1 is used both by src0 as a
           source and by the other operands as payload. To avoid side effects,
           we use a stack model: push all operand registers and spill the 0th
           dest last. Since every spill is appended right after the current
           instruction, the last spill emitted is the first instruction to
           execute after it, so registerPool + 1 still contains valid data at
           that point. */
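        /* A worked example of the pool bookkeeping above (the numbers follow
         * directly from the code): with simdWidth == 16, a DWORD register
         * needs simdWidth/8 == 2 GRFs and a QWORD one 2*simdWidth/8 == 4.
         * Starting at poolOffset == 1 (offset 0 is kept for the scratch
         * message header), unspilling one DWORD source and then one QWORD
         * source gives:
         *
         *   dword src: poolOffset 1, occupies pool GRFs 1..2 -> next offset 3
         *   qword src: poolOffset 3, occupies pool GRFs 3..6 -> next offset 7
         *
         * and the spill only succeeds if the final offset stays within
         * ctx.reservedSpillRegs. */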
        for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
          const GenRegister selReg = insn.dst(dstID);
          const ir::Register reg = selReg.reg();
          auto it = spilledRegs.find(reg);
          if (it != spilledRegs.end() && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
            ir::RegisterFamily family = getRegisterFamily(reg);
            if (family == ir::FAMILY_QWORD && poolOffset == 1) {
              // a qword register spill cannot share the scratch write message payload register
              poolOffset += simdWidth / 8;
            }
            struct RegSlot regSlot(reg, dstID, poolOffset, it->second.isTmpReg, it->second.addr);
            if (family == ir::FAMILY_QWORD)
              poolOffset += 2 * simdWidth / 8;
            else
              poolOffset += simdWidth / 8;
            regSet.push_back(regSlot);
          }
        }

        if (poolOffset > ctx.reservedSpillRegs)
          return false;

        while (!regSet.empty()) {
          struct RegSlot regSlot = regSet.back();
          regSet.pop_back();
          const GenRegister selReg = insn.dst(regSlot.dstID);
          if (!regSlot.isTmpReg) {
            /* For temporary registers, we don't need to spill. */
            SelectionInstruction *spill =
              this->create(SEL_OP_SPILL_REG,
                           (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(), 1);
            spill->state = insn.state; // GenInstructionState(simdWidth);
            spill->state.accWrEnable = 0;
            spill->state.saturate = 0;
            // Store the spilled register type.
            struct SpillReg tmp;
            tmp.type = selReg.type;
            tmp.vstride = selReg.vstride;
            tmp.hstride = selReg.hstride;
            tmp.width = selReg.width;
            SpillRegs[selReg.value.reg] = tmp;

            if (insn.opcode == SEL_OP_SEL)
              spill->state.predicate = GEN_PREDICATE_NONE;
            spill->src(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
                                        registerPool + regSlot.poolOffset, 0,
                                        selReg.type, selReg.vstride,
                                        selReg.width, selReg.hstride);
            spill->extra.scratchOffset = regSlot.addr + selReg.quarter * 4 * simdWidth;
            spill->extra.scratchMsgHeader = registerPool;
            for (uint32_t i = 0; i < 0 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(); i++)
              spill->dst(i) = ctx.getSimdWidth() == 8 ?
                GenRegister::vec8(GEN_GENERAL_REGISTER_FILE, registerPool + (i), 0) :
                GenRegister::vec16(GEN_GENERAL_REGISTER_FILE, registerPool + (i) * 2, 0);
            insn.append(*spill);
          }

          GenRegister dst = insn.dst(regSlot.dstID);
          // change nr/subnr, keep other register settings
          dst.physical = 1;
          dst.nr = registerPool + regSlot.poolOffset;
          dst.subnr = 0;
          insn.dst(regSlot.dstID) = dst;
        }
      }
    return true;
  }

  ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
    SelectionBlock *block = insn->parent;
    const uint32_t simdWidth = insn->state.execWidth;
    ir::Register tmp;
    GenRegister gr;

    // This will append the temporary register in the instruction block
    this->block = block;
    tmp = this->reg(ir::getFamily(type), simdWidth == 1);
    gr = this->selReg(tmp, type);
    if (needMov) {
      // Generate the MOV instruction and replace the register in the instruction
      SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
      mov->src(0) = GenRegister::retype(insn->src(regID), gr.type);
      mov->state = GenInstructionState(simdWidth);
      if (this->isScalarReg(insn->src(regID).reg()))
        mov->state.noMask = 1;
      mov->dst(0) = gr;
      insn->prepend(*mov);
    }
    insn->src(regID) = gr;
    return tmp;
  }

  ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
    SelectionBlock *block = insn->parent;
    uint32_t simdWidth;
    if (!GenRegister::isNull(insn->dst(regID)))
      simdWidth = this->isScalarReg(insn->dst(regID).reg()) ?
1 : insn->state.execWidth; else { GBE_ASSERT(needMov == false); simdWidth = insn->state.execWidth; } ir::Register tmp; GenRegister gr; this->block = block; tmp = this->reg(ir::getFamily(type)); gr = this->selReg(tmp, type); if (needMov) { // Generate the MOV instruction and replace the register in the instruction SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1); mov->dst(0) = GenRegister::retype(insn->dst(regID), gr.type); mov->state = GenInstructionState(simdWidth); if (simdWidth == 1) { mov->state.noMask = 1; mov->src(0) = GenRegister::retype(GenRegister::vec1(GEN_GENERAL_REGISTER_FILE, gr.reg()), gr.type); } else mov->src(0) = gr; insn->append(*mov); } insn->dst(regID) = gr; return tmp; } #define SEL_REG(SIMD16, SIMD8, SIMD1) \ if (ctx.sel->isScalarReg(reg) == true) \ return GenRegister::retype(GenRegister::SIMD1(reg), genType); \ else if (simdWidth == 8) \ return GenRegister::retype(GenRegister::SIMD8(reg), genType); \ else { \ GBE_ASSERT (simdWidth == 16); \ return GenRegister::retype(GenRegister::SIMD16(reg), genType); \ } GenRegister Selection::Opaque::selReg(ir::Register reg, ir::Type type) const { using namespace ir; const uint32_t genType = getGenType(type); const uint32_t simdWidth = ctx.getSimdWidth(); const RegisterData data = file.get(reg); const RegisterFamily family = data.family; switch (family) { case FAMILY_BOOL: SEL_REG(uw16grf, uw8grf, uw1grf); break; case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break; case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break; case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break; case FAMILY_QWORD: if (!this->hasLongType()) { SEL_REG(ud16grf, ud8grf, ud1grf); } else { SEL_REG(ul16grf, ul8grf, ul1grf); } break; default: NOT_SUPPORTED; } GBE_ASSERT(false); return GenRegister(); } #undef SEL_REG GenRegister Selection::Opaque::selRegQn(ir::Register reg, uint32_t q, ir::Type type) const { GenRegister sreg = this->selReg(reg, type); sreg.quarter = q; return sreg; } /*! Syntactic sugar for method declaration */ typedef const GenRegister &Reg; void Selection::Opaque::LABEL(ir::LabelIndex index) { SelectionInstruction *insn = this->appendInsn(SEL_OP_LABEL, 0, 0); insn->index = index.value(); } void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) { SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, 1); insn->src(0) = src; insn->dst(0) = fence; insn->extra.barrierType = barrierType; } void Selection::Opaque::FENCE(GenRegister dst) { SelectionInstruction *insn = this->appendInsn(SEL_OP_FENCE, 1, 0); insn->dst(0) = dst; } int Selection::Opaque::JMPI(Reg src, ir::LabelIndex index, ir::LabelIndex origin) { SelectionInstruction *insn = this->appendInsn(SEL_OP_JMPI, 0, 1); insn->src(0) = src; insn->index = index.value(); ir::LabelIndex start, end; if (origin.value() < index.value()) { // Forward Jump, need to exclude the target BB. Because we // need to jump to the beginning of it. start = origin; end = ir::LabelIndex(index.value() - 1); } else { start = index; end = origin; } // FIXME, this longjmp check is too hacky. We need to support instruction // insertion at code emission stage in the future. insn->extra.longjmp = ctx.getFunction().getDistance(start, end) > 3000; return insn->extra.longjmp ? 
2 : 1;
  }

  void Selection::Opaque::BRD(Reg src, ir::LabelIndex jip) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_BRD, 0, 1);
    insn->src(0) = src;
    insn->index = jip.value();
  }

  void Selection::Opaque::BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_BRC, 0, 1);
    insn->src(0) = src;
    insn->index = jip.value();
    insn->index1 = uip.value();
  }

  void Selection::Opaque::IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
    insn->src(0) = src;
    insn->index = jip.value();
    insn->index1 = uip.value();
  }

  void Selection::Opaque::ELSE(Reg src, ir::LabelIndex jip, ir::LabelIndex elseLabel) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_ELSE, 0, 1);
    insn->src(0) = src;
    insn->index = jip.value();
    this->LABEL(elseLabel);
  }

  void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip, ir::LabelIndex endifLabel) {
    if (endifLabel == 0)
      this->block->endifLabel = this->newAuxLabel();
    else
      this->block->endifLabel = endifLabel;
    this->LABEL(this->block->endifLabel);
    SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
    insn->src(0) = src;
    insn->index = this->block->endifLabel.value();
  }

  void Selection::Opaque::WHILE(Reg src, ir::LabelIndex jip) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_WHILE, 0, 1);
    insn->src(0) = src;
    insn->index = jip.value();
  }

  void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_CMP, 1, 2);
    insn->src(0) = src0;
    insn->src(1) = src1;
    insn->dst(0) = dst;
    insn->extra.function = conditional;
  }

  void Selection::Opaque::SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_SEL_CMP, 1, 2);
    insn->dst(0) = dst;
    insn->src(0) = src0;
    insn->src(1) = src1;
    insn->extra.function = conditional;
  }

  void Selection::Opaque::INDIRECT_MOVE(Reg dst, Reg tmp, Reg base, Reg regOffset, uint32_t immOffset) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_INDIRECT_MOVE, 2, 2);
    insn->dst(0) = dst;
    insn->dst(1) = tmp;
    insn->src(0) = base;
    insn->src(1) = regOffset;
    insn->extra.indirect_offset = immOffset;
  }

  void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
                                 uint32_t msgPayload, Reg src0,
                                 Reg src1, Reg src2, GenRegister bti,
                                 vector<GenRegister> temps)
  {
    unsigned dstNum = 1 + temps.size();
    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum, msgPayload + 1);

    if (bti.file != GEN_IMMEDIATE_VALUE) {
      insn->state.flag = 0;
      insn->state.subFlag = 1;
    }

    insn->dst(0) = dst;
    if (temps.size()) {
      insn->dst(1) = temps[0];
      insn->dst(2) = temps[1];
    }

    insn->src(0) = src0;
    if (msgPayload > 1)
      insn->src(1) = src1;
    if (msgPayload > 2)
      insn->src(2) = src2;
    insn->src(msgPayload) = bti;

    insn->extra.function = function;
    insn->extra.elem = msgPayload;

    if (hasSends() && msgPayload > 1) {
      insn->extra.splitSend = 1;
      SelectionVector *vector = this->appendVector();
      vector->regNum = 1;
      vector->offsetID = 0;
      vector->reg = &insn->src(0);
      vector->isSrc = 1;

      vector = this->appendVector();
      vector->regNum = msgPayload - 1;
      vector->offsetID = 1;
      vector->reg = &insn->src(1);
      vector->isSrc = 1;
    } else {
      SelectionVector *vector = this->appendVector();
      vector->regNum = msgPayload; // bti not included in the SelectionVector
      vector->offsetID = 0;
      vector->reg = &insn->src(0);
      vector->isSrc = 1;
    }
  }
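  /* The SelectionVector records appended above tell the register allocator
   * which registers must land in contiguous GRFs to form a message payload.
   * With split sends (hasSends()), the payload is described as two vectors:
   * src(0) alone (the address part) and src(1..msgPayload-1) (the data part),
   * so only each half needs to be contiguous. For msgPayload == 3:
   *
   *   classic send: one vector {src0, src1, src2} -> 3 contiguous registers
   *   split send:   vector {src0} + vector {src1, src2}
   */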
  void Selection::Opaque::ATOMICA64(Reg dst, uint32_t function,
                                    uint32_t msgPayload, vector<GenRegister> src,
                                    GenRegister bti,
                                    vector<GenRegister> temps)
  {
    unsigned dstNum = 1 + temps.size();
    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMICA64, dstNum, msgPayload + 1);

    insn->dst(0) = dst;
    if (temps.size()) {
      insn->dst(1) = temps[0];
      insn->dst(2) = temps[1];
    }

    for (uint32_t elemID = 0; elemID < msgPayload; ++elemID)
      insn->src(elemID) = src[elemID];
    insn->src(msgPayload) = bti;

    insn->extra.function = function;
    insn->extra.elem = msgPayload;

    SelectionVector *vector = this->appendVector();
    vector->regNum = msgPayload; // bti not included in the SelectionVector
    vector->offsetID = 0;
    vector->reg = &insn->src(0);
    vector->isSrc = 1;
  }

  void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
  void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
  void Selection::Opaque::WAIT(uint32_t n) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_WAIT, 0, 0);
    insn->extra.waitType = n;
  }

  void Selection::Opaque::READ64(Reg addr,
                                 const GenRegister *dst,
                                 const GenRegister *tmp,
                                 uint32_t elemNum,
                                 const GenRegister bti,
                                 bool native_long,
                                 vector<GenRegister> temps)
  {
    SelectionInstruction *insn = NULL;
    SelectionVector *srcVector = NULL;
    SelectionVector *dstVector = NULL;

    if (!native_long) {
      unsigned dstNum = elemNum + temps.size();
      insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
      srcVector = this->appendVector();
      dstVector = this->appendVector();
      // Regular instruction to encode
      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
        insn->dst(elemID) = dst[elemID];
      // flagTemp does not need to be put in a SelectionVector
      if (temps.size()) {
        insn->dst(elemNum) = temps[0];
        insn->dst(elemNum + 1) = temps[1];
      }
    } else {
      unsigned dstNum = elemNum * 2 + temps.size();
      insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
      srcVector = this->appendVector();
      dstVector = this->appendVector();
      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
        insn->dst(elemID) = tmp[elemID];
      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
        insn->dst(elemID + elemNum) = dst[elemID];
      // flagTemp does not need to be put in a SelectionVector
      if (temps.size()) {
        insn->dst(2 * elemNum) = temps[0];
        insn->dst(2 * elemNum + 1) = temps[1];
      }
    }

    if (bti.file != GEN_IMMEDIATE_VALUE) {
      insn->state.flag = 0;
      insn->state.subFlag = 1;
    }
    insn->src(0) = addr;
    insn->src(1) = bti;
    insn->extra.elem = elemNum;

    dstVector->regNum = elemNum;
    dstVector->isSrc = 0;
    dstVector->offsetID = 0;
    dstVector->reg = &insn->dst(0);

    srcVector->regNum = 1;
    srcVector->offsetID = 0;
    srcVector->isSrc = 1;
    srcVector->reg = &insn->src(0);
  }

  void Selection::Opaque::READ64A64(Reg addr,
                                    const GenRegister *dst,
                                    const GenRegister *tmp,
                                    uint32_t elemNum)
  {
    SelectionInstruction *insn = NULL;
    SelectionVector *srcVector = NULL;
    SelectionVector *dstVector = NULL;
    insn = this->appendInsn(SEL_OP_READ64A64, elemNum * 2, 1);
    srcVector = this->appendVector();
    dstVector = this->appendVector();
    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
      insn->dst(elemID) = tmp[elemID];
    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
      insn->dst(elemID + elemNum) = dst[elemID];

    insn->src(0) = addr;
    insn->extra.elem = elemNum;

    dstVector->regNum = elemNum;
    dstVector->isSrc = 0;
    dstVector->offsetID = 0;
    dstVector->reg = &insn->dst(0);

    srcVector->regNum = 1;
    srcVector->offsetID = 0;
    srcVector->isSrc = 1;
    srcVector->reg = &insn->src(0);
  }
  void Selection::Opaque::UNTYPED_READ(Reg addr,
                                       const GenRegister *dst,
                                       uint32_t elemNum,
                                       GenRegister bti,
                                       vector<GenRegister> temps)
  {
    unsigned dstNum = elemNum + temps.size();
    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, dstNum, 2);
    SelectionVector *srcVector = this->appendVector();
    SelectionVector *dstVector = this->appendVector();
    if (this->isScalarReg(dst[0].reg()))
      insn->state.noMask = 1;
    // Regular instruction to encode
    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
      insn->dst(elemID) = dst[elemID];
    if (temps.size()) {
      insn->dst(elemNum) = temps[0];
      insn->dst(elemNum + 1) = temps[1];
    }

    insn->src(0) = addr;
    insn->src(1) = bti;
    if (bti.file != GEN_IMMEDIATE_VALUE) {
      insn->state.flag = 0;
      insn->state.subFlag = 1;
    }

    insn->extra.elem = elemNum;

    // Sends require contiguous allocation
    dstVector->regNum = elemNum;
    dstVector->isSrc = 0;
    dstVector->offsetID = 0;
    dstVector->reg = &insn->dst(0);

    srcVector->regNum = 1;
    srcVector->isSrc = 1;
    srcVector->offsetID = 0;
    srcVector->reg = &insn->src(0);
  }

  void Selection::Opaque::UNTYPED_READA64(Reg addr,
                                          const GenRegister *dst,
                                          uint32_t dstNum,
                                          uint32_t elemNum)
  {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READA64, dstNum, 1);
    SelectionVector *srcVector = this->appendVector();
    SelectionVector *dstVector = this->appendVector();
    if (this->isScalarReg(dst[0].reg()))
      insn->state.noMask = 1;
    // Regular instruction to encode
    for (uint32_t id = 0; id < dstNum; ++id)
      insn->dst(id) = dst[id];

    insn->src(0) = addr;
    insn->extra.elem = elemNum;

    // Sends require contiguous allocation
    dstVector->regNum = dstNum;
    dstVector->isSrc = 0;
    dstVector->offsetID = 0;
    dstVector->reg = &insn->dst(0);

    srcVector->regNum = 1;
    srcVector->isSrc = 1;
    srcVector->offsetID = 0;
    srcVector->reg = &insn->src(0);
  }
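  /* Concrete register layout of the native-long WRITE64 path below, written
   * out for srcNum == 2 (the indices follow the code; note that the code
   * reuses tmp[0] for every payload copy):
   *
   *   src(0..1) = data (src[0], src[1])
   *   src(2)    = addr           // srcNum
   *   src(3..4) = tmp[0], tmp[0] // payload copies
   *   src(5)    = bti            // srcNum*2 + 1
   *   dst(0..1) = tmp[0]         // keeps the scheduler from reordering
   *
   * The non-native path instead lays out src as { addr, data..., bti }.
   */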
  void Selection::Opaque::WRITE64(Reg addr,
                                  const GenRegister *src,
                                  const GenRegister *tmp,
                                  uint32_t srcNum,
                                  GenRegister bti,
                                  bool native_long,
                                  vector<GenRegister> temps)
  {
    SelectionVector *vector = NULL;
    SelectionInstruction *insn = NULL;

    if (!native_long) {
      unsigned dstNum = temps.size();
      insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 2);
      vector = this->appendVector();
      // Register layout:
      // dst: (flagTemp)
      // src: addr, srcNum, bti
      insn->src(0) = addr;
      for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
        insn->src(elemID + 1) = src[elemID];
      insn->src(srcNum + 1) = bti;
      if (temps.size()) {
        insn->dst(0) = temps[0];
        insn->dst(1) = temps[1];
      }
      insn->extra.elem = srcNum;

      vector->regNum = srcNum + 1;
      vector->offsetID = 0;
      vector->reg = &insn->src(0);
      vector->isSrc = 1;
    } else { // handle the native long case
      unsigned dstNum = srcNum + temps.size();
      // Register layout:
      // dst: srcNum, (flagTemp)
      // src: srcNum, addr, srcNum, bti
      insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum * 2 + 2);
      for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
        insn->src(elemID) = src[elemID];
      insn->src(srcNum) = addr;
      for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
        insn->src(srcNum + 1 + elemID) = tmp[0];
      insn->src(srcNum * 2 + 1) = bti;

      /* We also need to add the tmp register to dst, in order to avoid the
         post-schedule error. */
      for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
        insn->dst(elemID) = tmp[0];
      if (temps.size()) {
        insn->dst(srcNum) = temps[0];
        insn->dst(srcNum + 1) = temps[1];
      }
      insn->extra.elem = srcNum;

      if (hasSends()) {
        insn->extra.splitSend = 1;
        // addr regs
        vector = this->appendVector();
        vector->regNum = 1;
        vector->offsetID = srcNum;
        vector->reg = &insn->src(srcNum);
        vector->isSrc = 1;
        // data regs
        vector = this->appendVector();
        vector->regNum = srcNum;
        vector->offsetID = srcNum + 1;
        vector->reg = &insn->src(srcNum + 1);
        vector->isSrc = 1;
      } else {
        vector = this->appendVector();
        vector->regNum = srcNum + 1;
        vector->offsetID = srcNum;
        vector->reg = &insn->src(srcNum);
        vector->isSrc = 1;
      }
    }

    if (bti.file != GEN_IMMEDIATE_VALUE) {
      insn->state.flag = 0;
      insn->state.subFlag = 1;
    }
  }

  void Selection::Opaque::WRITE64A64(Reg addr,
                                     const GenRegister *src,
                                     const GenRegister *tmp,
                                     uint32_t srcNum)
  {
    SelectionVector *vector = NULL;
    SelectionInstruction *insn = NULL;

    const uint32_t dstNum = srcNum;
    insn = this->appendInsn(SEL_OP_WRITE64A64, dstNum, srcNum * 2 + 1);
    vector = this->appendVector();

    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
      insn->src(elemID) = src[elemID];
    insn->src(srcNum) = addr;
    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
      insn->src(srcNum + 1 + elemID) = tmp[elemID];

    /* We also need to add the tmp register to dst, in order to avoid the
       post-schedule error. */
    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
      insn->dst(elemID) = tmp[elemID];

    insn->extra.elem = srcNum;

    vector->regNum = srcNum + 1;
    vector->offsetID = srcNum;
    vector->reg = &insn->src(srcNum);
    vector->isSrc = 1;
  }

  void Selection::Opaque::UNTYPED_WRITE(Reg addr,
                                        const GenRegister *src,
                                        uint32_t elemNum,
                                        GenRegister bti,
                                        vector<GenRegister> temps)
  {
    unsigned dstNum = temps.size();
    unsigned srcNum = elemNum + 2 + temps.size();
    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, dstNum, srcNum);
    if (bti.file != GEN_IMMEDIATE_VALUE) {
      insn->state.flag = 0;
      insn->state.subFlag = 1;
    }

    // Regular instruction to encode
    insn->src(0) = addr;
    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
      insn->src(elemID + 1) = src[elemID];
    insn->src(elemNum + 1) = bti;
    if (temps.size()) {
      insn->dst(0) = temps[0];
      insn->dst(1) = temps[1];
      insn->src(elemNum + 2) = temps[0];
      insn->src(elemNum + 3) = temps[1];
    }
    insn->extra.elem = elemNum;

    if (hasSends()) {
      insn->extra.splitSend = 1;
      SelectionVector *vector = this->appendVector();
      vector->regNum = elemNum;
      vector->reg = &insn->src(1);
      vector->offsetID = 1;
      vector->isSrc = 1;
      vector = this->appendVector();
      vector->regNum = 1;
      vector->reg = &insn->src(0);
      vector->offsetID = 0;
      vector->isSrc = 1;
    } else {
      // Sends require contiguous allocation for the sources
      SelectionVector *vector = this->appendVector();
      vector->regNum = elemNum + 1;
      vector->reg = &insn->src(0);
      vector->offsetID = 0;
      vector->isSrc = 1;
    }
  }

  void Selection::Opaque::UNTYPED_WRITEA64(const GenRegister *src,
                                           uint32_t msgNum,
                                           uint32_t elemNum)
  {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITEA64, 0, msgNum);
    SelectionVector *vector = this->appendVector();

    // Regular instruction to encode
    for (uint32_t id = 0; id < msgNum; ++id)
      insn->src(id) = src[id];
    insn->extra.elem = elemNum;

    // Sends require contiguous allocation for the sources
    vector->regNum = msgNum;
    vector->reg = &insn->src(0);
    vector->offsetID = 0;
    vector->isSrc = 1;
  }
this->appendInsn(SEL_OP_BYTE_GATHER, dstNum, 2); SelectionVector *srcVector = this->appendVector(); SelectionVector *dstVector = this->appendVector(); if (bti.file != GEN_IMMEDIATE_VALUE) { insn->state.flag = 0; insn->state.subFlag = 1; } if (this->isScalarReg(dst.reg())) insn->state.noMask = 1; // Instruction to encode insn->src(0) = addr; insn->src(1) = bti; insn->dst(0) = dst; if (temps.size()) { insn->dst(1) = temps[0]; insn->dst(2) = temps[1]; } insn->extra.elem = elemSize; // byte gather requires vector in the sense that scalar are not allowed // (yet) dstVector->regNum = 1; dstVector->isSrc = 0; dstVector->offsetID = 0; dstVector->reg = &insn->dst(0); srcVector->regNum = 1; srcVector->isSrc = 1; srcVector->offsetID = 0; srcVector->reg = &insn->src(0); } void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, vector temps) { unsigned dstNum = temps.size(); SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, dstNum, 3); if (bti.file != GEN_IMMEDIATE_VALUE) { insn->state.flag = 0; insn->state.subFlag = 1; } if (temps.size()) { insn->dst(0) = temps[0]; insn->dst(1) = temps[1]; } // Instruction to encode insn->src(0) = addr; insn->src(1) = src; insn->src(2) = bti; insn->extra.elem = elemSize; if (hasSends()) { insn->extra.splitSend = 1; SelectionVector *vector = this->appendVector(); vector->regNum = 1; vector->isSrc = 1; vector->offsetID = 0; vector->reg = &insn->src(0); vector = this->appendVector(); vector->regNum = 1; vector->isSrc = 1; vector->offsetID = 1; vector->reg = &insn->src(1); } else { SelectionVector *vector = this->appendVector(); vector->regNum = 2; vector->isSrc = 1; vector->offsetID = 0; vector->reg = &insn->src(0); } } void Selection::Opaque::BYTE_GATHERA64(Reg dst, Reg addr, uint32_t elemSize) { SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHERA64, 1, 1); SelectionVector *srcVector = this->appendVector(); SelectionVector *dstVector = this->appendVector(); if (this->isScalarReg(dst.reg())) insn->state.noMask = 1; insn->src(0) = addr; insn->dst(0) = dst; insn->extra.elem = elemSize; dstVector->regNum = 1; dstVector->isSrc = 0; dstVector->offsetID = 0; dstVector->reg = &insn->dst(0); srcVector->regNum = 1; srcVector->isSrc = 1; srcVector->offsetID = 0; srcVector->reg = &insn->src(0); } void Selection::Opaque::BYTE_SCATTERA64(GenRegister *msg, uint32_t msgNum, uint32_t elemSize) { SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTERA64, 0, msgNum); SelectionVector *vector = this->appendVector(); for (unsigned i = 0; i < msgNum; i++) insn->src(i) = msg[i]; insn->extra.elem = elemSize; vector->regNum = msgNum; vector->isSrc = 1; vector->offsetID = 0; vector->reg = &insn->src(0); } void Selection::Opaque::DWORD_GATHER(Reg dst, Reg addr, uint32_t bti) { SelectionInstruction *insn = this->appendInsn(SEL_OP_DWORD_GATHER, 1, 1); SelectionVector *vector = this->appendVector(); SelectionVector *srcVector = this->appendVector(); if (this->isScalarReg(dst.reg())) insn->state.noMask = 1; insn->src(0) = addr; insn->dst(0) = dst; insn->setbti(bti); vector->regNum = 1; vector->isSrc = 0; vector->offsetID = 0; vector->reg = &insn->dst(0); srcVector->regNum = 1; srcVector->isSrc = 1; srcVector->offsetID = 0; srcVector->reg = &insn->src(0); } void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum) { SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1); insn->src(0) = src; insn->extra.elem = 4 / elemSize; for(uint32_t i = 0; 
i < elemNum; i++) insn->dst(i) = dst[i]; } void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum) { SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum); for(uint32_t i = 0; i < elemNum; i++) insn->src(i) = src[i]; insn->extra.elem = 4 / elemSize; insn->dst(0) = dst; } void Selection::Opaque::UNPACK_LONG(const GenRegister dst, const GenRegister src) { SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_LONG, 1, 1); insn->src(0) = src; insn->dst(0) = dst; } void Selection::Opaque::PACK_LONG(const GenRegister dst, const GenRegister src) { SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_LONG, 1, 1); insn->src(0) = src; insn->dst(0) = dst; } void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) { SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 2); insn->dst(0) = dst; insn->src(0) = src0; insn->src(1) = src1; insn->extra.function = function; } void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src) { SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 1); insn->dst(0) = dst; insn->src(0) = src; insn->extra.function = function; } void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool native_long) { SelectionInstruction *insn = NULL; if (native_long) insn = this->appendInsn(SEL_OP_I64MUL, 2, 2); else insn = this->appendInsn(SEL_OP_I64MUL, 7, 2); insn->dst(0) = dst; insn->src(0) = src0; insn->src(1) = src1; if (native_long) { insn->dst(1) = tmp[0]; } else { for (int i = 0; i < 6; i++) insn->dst(i + 1) = tmp[i]; } } void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmp_num) { SelectionInstruction *insn = this->appendInsn(SEL_OP_I64DIV, tmp_num + 1, 2); insn->dst(0) = dst; insn->src(0) = src0; insn->src(1) = src1; for(int i = 0; i < tmp_num; i++) insn->dst(i + 1) = tmp[i]; } void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmp_num) { SelectionInstruction *insn = this->appendInsn(SEL_OP_I64REM, tmp_num + 1, 2); insn->dst(0) = dst; insn->src(0) = src0; insn->src(1) = src1; for(int i = 0; i < tmp_num; i++) insn->dst(i + 1) = tmp[i]; } void Selection::Opaque::F64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmpNum) { SelectionInstruction *insn = this->appendInsn(SEL_OP_F64DIV, tmpNum + 1, 2); insn->dst(0) = dst; insn->src(0) = src0; insn->src(1) = src1; for(int i = 0; i < tmpNum; i++) insn->dst(i + 1) = tmp[i]; } void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) { SelectionInstruction *insn = this->appendInsn(opcode, 1, 1); insn->dst(0) = dst; insn->src(0) = src; } void Selection::Opaque::ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src, Reg temp) { SelectionInstruction *insn = this->appendInsn(opcode, 2, 1); insn->dst(0) = dst; insn->src(0) = src; insn->dst(1) = temp; } void Selection::Opaque::ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1) { SelectionInstruction *insn = this->appendInsn(opcode, 1, 2); insn->dst(0) = dst; insn->src(0) = src0; insn->src(1) = src1; } void Selection::Opaque::ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp) { SelectionInstruction *insn = this->appendInsn(opcode, 2, 2); insn->dst(0) = dst; insn->src(0) = src0; insn->src(1) = src1; insn->dst(1) = temp; } void Selection::Opaque::ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2) { SelectionInstruction *insn = this->appendInsn(opcode, 1, 3); insn->dst(0) = dst; insn->src(0) = src0; 
insn->src(1) = src1;
    insn->src(2) = src2;
  }

  void Selection::Opaque::SIMD_SHUFFLE(Reg dst, Reg src0, Reg src1) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_SIMD_SHUFFLE, 1, 2);
    insn->dst(0) = dst;
    insn->src(0) = src0;
    insn->src(1) = src1;
  }

  GenRegister Selection::Opaque::getLaneIDReg() {
    const GenRegister laneID = GenRegister::immv(0x76543210);
    GenRegister dst;

    uint32_t execWidth = curr.execWidth;
    if (execWidth == 8) {
      // Workaround to force 32-byte register alignment
      dst = selReg(reg(ir::RegisterFamily::FAMILY_DWORD), ir::TYPE_U16);
      MOV(dst, laneID);
    } else {
      dst = selReg(reg(ir::RegisterFamily::FAMILY_WORD), ir::TYPE_U16);
      push();
      curr.execWidth = 8;
      curr.noMask = 1;
      MOV(dst, laneID);
      // The packed unsigned half-byte integer vector immediate does not work;
      // emulate it by adding 8 to the signed vector.
      const GenRegister eight = GenRegister::immuw(8);
      ADD(GenRegister::offset(dst, 0, 16), dst, eight);
      pop();
    }
    return dst;
  }

  void Selection::Opaque::I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64CMP, 3, 2);
    insn->src(0) = src0;
    insn->src(1) = src1;
    for (int i = 0; i < 3; i++)
      insn->dst(i) = tmp[i];
    insn->extra.function = conditional;
  }

  void Selection::Opaque::I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATADD, 6, 2);
    insn->dst(0) = dst;
    insn->src(0) = src0;
    insn->src(1) = src1;
    for (int i = 0; i < 5; i++)
      insn->dst(i + 1) = tmp[i];
  }

  void Selection::Opaque::I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATSUB, 6, 2);
    insn->dst(0) = dst;
    insn->src(0) = src0;
    insn->src(1) = src1;
    for (int i = 0; i < 5; i++)
      insn->dst(i + 1) = tmp[i];
  }

  void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[6]) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVI64_TO_F, 7, 1);
    insn->dst(0) = dst;
    insn->src(0) = src;
    for (int i = 0; i < 6; i++)
      insn->dst(i + 1) = tmp[i];
  }

  void Selection::Opaque::CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[2]) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVF_TO_I64, 3, 1);
    insn->dst(0) = dst;
    insn->src(0) = src;
    for (int i = 0; i < 2; i++)
      insn->dst(i + 1) = tmp[i];
  }

  void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2,
                                    GenRegister *tmp, int tmp_num) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MADSAT, tmp_num + 1, 3);
    insn->dst(0) = dst;
    insn->src(0) = src0;
    insn->src(1) = src1;
    insn->src(2) = src2;
    for (int i = 0; i < tmp_num; i++)
      insn->dst(i + 1) = tmp[i];
  }

  void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1,
                                     GenRegister *tmp, int tmp_num) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64_MUL_HI, tmp_num + 1, 2);
    insn->dst(0) = dst;
    insn->src(0) = src0;
    insn->src(1) = src1;
    for (int i = 0; i < tmp_num; i++)
      insn->dst(i + 1) = tmp[i];
  }

  void Selection::Opaque::I64HADD(Reg dst, Reg src0, Reg src1,
                                  GenRegister *tmp, int tmp_num) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64HADD, tmp_num + 1, 2);
    insn->dst(0) = dst;
    insn->src(0) = src0;
    insn->src(1) = src1;
    for (int i = 0; i < tmp_num; i++)
      insn->dst(i + 1) = tmp[i];
  }

  void Selection::Opaque::I64RHADD(Reg dst, Reg src0, Reg src1,
                                   GenRegister *tmp, int tmp_num) {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64RHADD, tmp_num + 1, 2);
    insn->dst(0) = dst;
    insn->src(0) = src0;
    insn->src(1) = src1;
    for (int i = 0; i < tmp_num; i++)
      insn->dst(i + 1) = tmp[i];
  }

  void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg
dst, Reg src0, Reg src1, GenRegister tmp[6]) { SelectionInstruction *insn = this->appendInsn(opcode, 7, 2); insn->dst(0) = dst; insn->src(0) = src0; insn->src(1) = src1; for(int i = 0; i < 6; i ++) insn->dst(i + 1) = tmp[i]; } void Selection::Opaque::CALC_TIMESTAMP(GenRegister ts[5], int tsN, GenRegister tmp, uint32_t pointNum, uint32_t tsType) { SelectionInstruction *insn = NULL; if (!this->hasLongType()) { insn = this->appendInsn(SEL_OP_CALC_TIMESTAMP, tsN + 1, tsN); } else {// No need for tmp insn = this->appendInsn(SEL_OP_CALC_TIMESTAMP, tsN, tsN); } for (int i = 0; i < tsN; i++) { insn->src(i) = ts[i]; insn->dst(i) = ts[i]; } if (!this->hasLongType()) insn->dst(tsN) = tmp; insn->extra.pointNum = static_cast(pointNum); insn->extra.timestampType = static_cast(tsType); } void Selection::Opaque::STORE_PROFILING(uint32_t profilingType, uint32_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister ts[5], int tsNum) { if (tsNum == 3) { // SIMD16 mode SelectionInstruction *insn = this->appendInsn(SEL_OP_STORE_PROFILING, 1, 3); for (int i = 0; i < 3; i++) insn->src(i) = ts[i]; insn->dst(0) = tmp0; insn->extra.profilingType = static_cast(profilingType); insn->extra.profilingBTI = static_cast(bti); } else { // SIMD8 mode GBE_ASSERT(tsNum == 5); SelectionInstruction *insn = this->appendInsn(SEL_OP_STORE_PROFILING, 2, 5); SelectionVector *dstVector = this->appendVector(); for (int i = 0; i < 5; i++) insn->src(i) = ts[i]; insn->dst(0) = tmp0; insn->dst(1) = tmp1; dstVector->regNum = 2; dstVector->isSrc = 0; dstVector->offsetID = 0; dstVector->reg = &insn->dst(0); insn->extra.profilingType = static_cast(profilingType); insn->extra.profilingBTI = static_cast(bti); } } void Selection::Opaque::PRINTF(uint8_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister src[8], int srcNum, uint16_t num, bool isContinue, uint32_t totalSize) { SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 2, srcNum); for (int i = 0; i < srcNum; i++) insn->src(i) = src[i]; insn->dst(0) = tmp0; insn->dst(1) = tmp1; if (hasSends()) { insn->extra.printfSplitSend = 1; SelectionVector *vector = this->appendVector(); vector->regNum = 1; vector->reg = &insn->dst(0); vector->offsetID = 0; vector->isSrc = 0; vector = this->appendVector(); vector->regNum = 1; vector->reg = &insn->dst(1); vector->offsetID = 1; vector->isSrc = 0; } else { SelectionVector *vector = this->appendVector(); vector->regNum = 2; vector->reg = &insn->dst(0); vector->offsetID = 0; vector->isSrc = 0; } insn->extra.printfSize = static_cast(totalSize); insn->extra.continueFlag = isContinue; insn->extra.printfBTI = bti; insn->extra.printfNum = num; } void Selection::Opaque::WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister tmpData1, GenRegister localThreadID, GenRegister localThreadNUM, GenRegister tmpData2, GenRegister slmOff, vector msg, GenRegister localBarrier) { SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 6); insn->extra.wgop.workgroupOp = wg_op; insn->dst(0) = dst; insn->dst(1) = tmpData1; for(uint32_t i = 0; i < msg.size(); i++) insn->dst(2 + i) = msg[i]; insn->src(0) = localThreadID; insn->src(1) = localThreadNUM; insn->src(2) = src; insn->src(3) = tmpData2; insn->src(4) = slmOff; insn->src(5) = localBarrier; if (hasSends()) { insn->extra.wgop.splitSend = 1; SelectionVector *vector = this->appendVector(); vector->regNum = 1; vector->offsetID = 2; vector->reg = &insn->dst(2); vector->isSrc = 0; vector = this->appendVector(); vector->regNum = msg.size() - 1; vector->offsetID = 3; vector->reg = 
&insn->dst(3); vector->isSrc = 0; } else { /* allocate continuous GRF registers for READ/WRITE to SLM */ SelectionVector *vector = this->appendVector(); vector->regNum = msg.size(); vector->offsetID = 2; vector->reg = &insn->dst(2); vector->isSrc = 0; } } void Selection::Opaque::SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister tmpData1, GenRegister tmpData2) { SelectionInstruction *insn = this->appendInsn(SEL_OP_SUBGROUP_OP, 2, 2); insn->extra.wgop.workgroupOp = wg_op; insn->dst(0) = dst; insn->dst(1) = tmpData1; insn->src(0) = src; insn->src(1) = tmpData2; } void Selection::Opaque::OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister header, uint32_t bti, uint32_t ow_size) { SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size, 1); SelectionVector *vector = this->appendVector(); insn->src(0) = header; for (uint32_t i = 0; i < vec_size; ++i) insn->dst(i) = dsts[i]; insn->setbti(bti); insn->extra.elem = ow_size; // number of OWord size // tmp regs for OWORD read dst vector->regNum = vec_size; vector->reg = &insn->dst(0); vector->offsetID = 0; vector->isSrc = 0; } void Selection::Opaque::OBWRITE(GenRegister header, GenRegister* values, uint32_t vec_size, uint32_t bti, uint32_t ow_size) { SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, vec_size + 1); insn->src(0) = header; for (uint32_t i = 0; i < vec_size; ++i) insn->src(i + 1) = values[i]; insn->setbti(bti); insn->extra.elem = ow_size; // number of OWord_size // For A64 write, we did not add sends support yet. if (hasSends() && bti != 255) { insn->extra.splitSend = 1; SelectionVector *vector = this->appendVector(); vector->regNum = 1; vector->reg = &insn->src(0); vector->offsetID = 0; vector->isSrc = 1; vector = this->appendVector(); vector->regNum = vec_size; vector->reg = &insn->src(1); vector->offsetID = 1; vector->isSrc = 1; } else { // tmp regs for OWORD write header and values SelectionVector *vector = this->appendVector(); vector->regNum = vec_size + 1; vector->reg = &insn->src(0); vector->offsetID = 0; vector->isSrc = 1; } } void Selection::Opaque::MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t response_size) { SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, tmp_size, 1); insn->src(0) = header; insn->setbti(bti); insn->extra.elem = response_size; // send response length for (uint32_t i = 0; i < tmp_size; ++i) { insn->dst(i) = dsts[i]; } SelectionVector *vector = this->appendVector(); vector->regNum = tmp_size; vector->reg = &insn->dst(0); vector->offsetID = 0; vector->isSrc = 0; } void Selection::Opaque::MBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t data_size) { SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 + tmp_size); insn->src(0) = header; for (uint32_t i = 0; i < tmp_size; ++i) insn->src(1 + i) = values[i]; insn->setbti(bti); insn->extra.elem = data_size; // msg data part size if (hasSends()) { insn->extra.splitSend = 1; SelectionVector *vector = this->appendVector(); vector->regNum = 1; vector->reg = &insn->src(0); vector->offsetID = 0; vector->isSrc = 1; vector = this->appendVector(); vector->regNum = tmp_size; vector->reg = &insn->src(1); vector->offsetID = 1; vector->isSrc = 1; } else { // We need to put the header and the data together SelectionVector *vector = this->appendVector(); vector->regNum = 1 + tmp_size; vector->reg = &insn->src(0); vector->offsetID = 0; vector->isSrc = 1; } } // Boiler plate to initialize the selection library at c++ pre-main 
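  /* A minimal sketch of the pre-main registration idiom used just below
   * (names here are illustrative, not part of the backend): a file-scope
   * object whose constructor runs during static initialization and which
   * registers an atexit() hook for teardown.
   *
   *   static Library *lib = NULL;
   *   static void destroyLib(void) { delete lib; }
   *   static struct LibInitializer {
   *     LibInitializer(void) {
   *       lib = new Library;   // runs before main()
   *       atexit(destroyLib);  // runs after main() returns
   *     }
   *   } libInitializer;
   */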
static SelectionLibrary *selLib = NULL;
  static void destroySelectionLibrary(void) { GBE_DELETE(selLib); }
  static struct SelectionLibraryInitializer {
    SelectionLibraryInitializer(void) {
      selLib = GBE_NEW_NO_ARG(SelectionLibrary);
      atexit(destroySelectionLibrary);
    }
  } selectionLibraryInitializer;

  bool Selection::Opaque::isRoot(const ir::Instruction &insn) const {
    if (insn.hasSideEffect() ||
        insn.isMemberOf<ir::BranchInstruction>() ||
        insn.isMemberOf<ir::LabelInstruction>())
      return true;

    // No side effect, not a branch and no destination? Impossible
    GBE_ASSERT(insn.getDstNum() >= 1);

    // Root if alive outside the block.
    // XXX we should use Value and not registers in liveness info
    const ir::BasicBlock *insnBlock = insn.getParent();
    const ir::Liveness &liveness = this->ctx.getLiveness();
    const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(insnBlock);
    for (uint32_t i = 0; i < insn.getDstNum(); i++) {
      const ir::Register reg = insn.getDst(i);
      if (liveOut.contains(reg))
        return true;
    }

    // The instruction is only used in the current basic block
    return false;
  }

  bool Selection::Opaque::hasQWord(const ir::Instruction &insn) {
    for (uint32_t i = 0; i < insn.getSrcNum(); i++) {
      const ir::Register reg = insn.getSrc(i);
      if (getRegisterFamily(reg) == ir::FAMILY_QWORD)
        return true;
    }
    for (uint32_t i = 0; i < insn.getDstNum(); i++) {
      const ir::Register reg = insn.getDst(i);
      if (getRegisterFamily(reg) == ir::FAMILY_QWORD)
        return true;
    }
    return false;
  }

  uint32_t Selection::Opaque::buildBasicBlockDAG(const ir::BasicBlock &bb) {
    using namespace ir;

    // Clear all registers
    for (uint32_t regID = 0; regID < this->regNum; ++regID)
      this->regDAG[regID] = NULL;

    this->block->hasBarrier = false;
    this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA ||
                             bb.getLastInstruction()->getOpcode() == OP_RET;
    if (!this->block->hasBranch)
      this->block->endifOffset = -1;

    // Build the DAG on the fly
    uint32_t insnNum = 0;
    const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
      if (insn.getOpcode() == OP_SYNC)
        this->block->hasBarrier = true;

      // Build a selection DAG node for the instruction
      SelectionDAG *dag = this->newSelectionDAG(insn);

      // Point to non-root children
      const uint32_t srcNum = insn.getSrcNum();
      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
        const ir::Register reg = insn.getSrc(srcID);
        SelectionDAG *child = this->regDAG[reg];
        if (child) {
          const ir::Instruction &childInsn = child->insn;
          const uint32_t childSrcNum = childInsn.getSrcNum();

          // We can merge a child only if its sources are still valid
          bool mergeable = true;
          for (uint32_t otherID = 0; otherID < childSrcNum; ++otherID) {
            const SelectionDAG *srcDAG = child->child[otherID];
            const ir::Register srcReg = childInsn.getSrc(otherID);
            SelectionDAG *currDAG = this->regDAG[srcReg];
            if (srcDAG != currDAG) {
              mergeable = false;
              break;
            }
          }
          if (mergeable)
            dag->setAsMergeable(srcID);
          dag->child[srcID] = child;
          // Check whether this bool is used as a normal source
          // operand other than BRA/SEL.
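          /* A hypothetical GenIR fragment, for exposition only, of what the
           * check below distinguishes:
           *   b = LT.s32 x y     // b is FAMILY_BOOL and can live in a flag
           *   d = SEL.s32 b a c  // b is only read as the predicate (source 0)
           *   e = AND.b16 b b2   // b is read as a plain data operand
           * BRA and SEL's source 0 consume the bool through the flag register,
           * but an instruction like the AND needs the bool materialized as a
           * register value (scalar booleans are materialized too), which is
           * what computeBool records on the producing node. */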
if (getRegisterFamily(reg) == FAMILY_BOOL) { if ((insn.getOpcode() != OP_BRA && (insn.getOpcode() != OP_SEL || (insn.getOpcode() == OP_SEL && srcID != 0))) || (isScalarReg(reg))) child->computeBool = true; } child->isUsed = true; } else dag->child[srcID] = NULL; } // Make it a root if we must if (this->isRoot(insn)) dag->isRoot = 1; // Save the DAG <-> instruction mapping this->insnDAG[insnNum++] = dag; // Associate all output registers to this instruction const uint32_t dstNum = insn.getDstNum(); for (uint32_t dstID = 0; dstID < dstNum; ++dstID) { const ir::Register reg = insn.getDst(dstID); this->regDAG[reg] = dag; } }); return insnNum; } extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp #define SET_SEL_DBGINFO(I) \ if(OCL_DEBUGINFO) \ this->setDBGInfo_SEL(I.DBGInfo) void Selection::Opaque::matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum) { // Bottom up code generation bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier; needEndif = needEndif && bb.needEndif; if (needEndif) { if(!bb.needIf) // this basic block is the exit of a structure this->ENDIF(GenRegister::immd(0), bb.endifLabel, bb.endifLabel); else { const ir::BasicBlock *next = bb.getNextBlock(); this->ENDIF(GenRegister::immd(0), next->getLabelIndex()); } } for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) { // Process all possible patterns for this instruction SelectionDAG &dag = *insnDAG[insnID]; SET_SEL_DBGINFO(dag.insn); if (dag.isRoot) { const ir::Instruction &insn = dag.insn; const ir::Opcode opcode = insn.getOpcode(); auto it = selLib->patterns[opcode].begin(); const auto end = selLib->patterns[opcode].end(); // Start a new code fragment this->startBackwardGeneration(); // If there is no branch at the end of this block. // Try all the patterns from best to worst do { if ((*it)->emit(*this, dag)) break; ++it; } while (it != end); GBE_ASSERT(it != end); // If we are in if/endif fix mode, and this block is // large enough, we need to insert endif/if pair to eliminate // the too long if/endif block. 
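        /* Sketch of the rewrite guarded below (jip is this block's endif
         * label, and the 1000 comes from the insnList.size() % 1000 test):
         * a span that would otherwise emit
         *   IF jip ... <thousands of selected instructions> ... ENDIF
         * is periodically chopped into
         *   IF jip ... <about 1000 instructions> ENDIF IF jip ... ENDIF
         * so that no single if/endif block grows too long, which is exactly
         * the problem the comment above describes. */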
if (this->ctx.getIFENDIFFix() && this->block->insnList.size() != 0 && this->block->insnList.size() % 1000 == 0 && this->block->endifLabel.value() != 0) { this->curr.flag = 0; this->curr.subFlag = 1; ir::LabelIndex jip = this->block->endifLabel; this->ENDIF(GenRegister::immd(0), jip); this->push(); this->curr.predicate = GEN_PREDICATE_NORMAL; this->IF(GenRegister::immd(0), jip, jip); this->pop(); } // Output the code in the current basic block this->endBackwardGeneration(); } } } #undef SET_SEL_DBGINFO void Selection::Opaque::select(void) { using namespace ir; const Function &fn = ctx.getFunction(); // Perform the selection per basic block fn.foreachBlock([&](const BasicBlock &bb) { this->dagPool.rewind(); this->appendBlock(bb); const uint32_t insnNum = this->buildBasicBlockDAG(bb); this->matchBasicBlock(bb, insnNum); }); } void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform) { SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum); SelectionVector *dstVector = this->appendVector(); SelectionVector *msgVector = this->appendVector(); // Regular instruction to encode for (uint32_t elemID = 0; elemID < dstNum; ++elemID) insn->dst(elemID) = dst[elemID]; for (uint32_t elemID = 0; elemID < msgNum; ++elemID) insn->src(elemID) = msgPayloads[elemID]; // Sends require contiguous allocation dstVector->regNum = dstNum; dstVector->isSrc = 0; dstVector->offsetID = 0; dstVector->reg = &insn->dst(0); // Only the messages require contiguous registers. msgVector->regNum = msgNum; msgVector->isSrc = 1; msgVector->offsetID = 0; msgVector->reg = &insn->src(0); insn->setbti(bti); insn->extra.sampler = sampler; insn->extra.rdmsglen = msgNum; insn->extra.isLD = isLD; insn->extra.isUniform = isUniform; } void Selection::Opaque::VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type, uint32_t vme_search_path_lut, uint32_t lut_sub) { SelectionInstruction *insn = this->appendInsn(SEL_OP_VME, dstNum, srcNum); SelectionVector *dstVector = this->appendVector(); SelectionVector *msgVector = this->appendVector(); for (uint32_t elemID = 0; elemID < dstNum; ++elemID) insn->dst(elemID) = dst[elemID]; for (uint32_t elemID = 0; elemID < srcNum; ++elemID) insn->src(elemID) = payloadVal[elemID]; dstVector->regNum = dstNum; dstVector->isSrc = 0; dstVector->offsetID = 0; dstVector->reg = &insn->dst(0); msgVector->regNum = srcNum; msgVector->isSrc = 1; msgVector->offsetID = 0; msgVector->reg = &insn->src(0); insn->setbti(bti); insn->extra.msg_type = msg_type; insn->extra.vme_search_path_lut = vme_search_path_lut; insn->extra.lut_sub = lut_sub; } void Selection::Opaque::IME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type) { SelectionInstruction *insn = this->appendInsn(SEL_OP_IME, dstNum, srcNum); SelectionVector *dstVector = this->appendVector(); for (uint32_t elemID = 0; elemID < dstNum; ++elemID) insn->dst(elemID) = dst[elemID]; for (uint32_t elemID = 0; elemID < srcNum; ++elemID) insn->src(elemID) = payloadVal[elemID]; dstVector->regNum = dstNum; dstVector->isSrc = 0; dstVector->offsetID = 0; dstVector->reg = &insn->dst(0); insn->setbti(bti); insn->extra.ime_msg_type = msg_type; } /////////////////////////////////////////////////////////////////////////// // Code selection public implementation /////////////////////////////////////////////////////////////////////////// const 
GenContext& Selection::getCtx() { return this->opaque->ctx; } Selection::Selection(GenContext &ctx) { this->blockList = NULL; this->opaque = GBE_NEW(Selection::Opaque, ctx); this->opaque->setSlowByteGather(true); opt_features = 0; } Selection75::Selection75(GenContext &ctx) : Selection(ctx) { this->opaque->setSlowByteGather(false); opt_features = 0; } Selection8::Selection8(GenContext &ctx) : Selection(ctx) { this->opaque->setHas32X32Mul(true); this->opaque->setHasLongType(true); this->opaque->setHasDoubleType(true); this->opaque->setSlowByteGather(false); this->opaque->setHasHalfType(true); opt_features = SIOF_LOGICAL_SRCMOD; } SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) { this->opaque->setHas32X32Mul(true); this->opaque->setHasLongType(true); this->opaque->setHasDoubleType(true); this->opaque->setLongRegRestrict(true); this->opaque->setSlowByteGather(false); this->opaque->setHasHalfType(true); opt_features = SIOF_LOGICAL_SRCMOD | SIOF_OP_MOV_LONG_REG_RESTRICT; } Selection9::Selection9(GenContext &ctx) : Selection(ctx) { this->opaque->setHas32X32Mul(true); this->opaque->setHasLongType(true); this->opaque->setHasDoubleType(true); this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); this->opaque->setSlowByteGather(false); this->opaque->setHasHalfType(true); this->opaque->setHasSends(true); opt_features = SIOF_LOGICAL_SRCMOD; } SelectionBxt::SelectionBxt(GenContext &ctx) : Selection(ctx) { this->opaque->setHas32X32Mul(true); this->opaque->setHasLongType(true); this->opaque->setLongRegRestrict(true); this->opaque->setHasDoubleType(true); this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); this->opaque->setSlowByteGather(false); this->opaque->setHasHalfType(true); opt_features = SIOF_LOGICAL_SRCMOD | SIOF_OP_MOV_LONG_REG_RESTRICT; } SelectionKbl::SelectionKbl(GenContext &ctx) : Selection(ctx) { this->opaque->setHas32X32Mul(true); this->opaque->setHasLongType(true); this->opaque->setHasDoubleType(true); this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); this->opaque->setSlowByteGather(false); this->opaque->setHasHalfType(true); this->opaque->setHasSends(true); opt_features = SIOF_LOGICAL_SRCMOD; } SelectionGlk::SelectionGlk(GenContext &ctx) : Selection(ctx) { this->opaque->setHas32X32Mul(true); this->opaque->setHasLongType(true); this->opaque->setLongRegRestrict(true); this->opaque->setHasDoubleType(true); this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); this->opaque->setSlowByteGather(false); this->opaque->setHasHalfType(true); opt_features = SIOF_LOGICAL_SRCMOD | SIOF_OP_MOV_LONG_REG_RESTRICT; } void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D) { uint32_t elemID = 0; uint32_t i; SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum); for( i = 0; i < msgNum; ++i, ++elemID) insn->src(elemID) = msgs[i]; insn->setbti(bti); insn->extra.msglen = msgNum; insn->extra.is3DWrite = is3D; if (hasSends()) { assert(msgNum == 9); insn->extra.typedWriteSplitSend = 1; //header + coords SelectionVector *msgVector = this->appendVector(); msgVector->regNum = 5; msgVector->isSrc = 1; msgVector->offsetID = 0; msgVector->reg = &insn->src(0); //data msgVector = this->appendVector(); msgVector->regNum = 4; msgVector->isSrc = 1; msgVector->offsetID = 5; msgVector->reg = &insn->src(5); } else { // Send require contiguous allocation SelectionVector *msgVector = this->appendVector(); msgVector->regNum = msgNum; msgVector->isSrc = 1; msgVector->offsetID = 0; msgVector->reg = &insn->src(0); } } Selection::~Selection(void) { GBE_DELETE(this->opaque); } 
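  /* Everything below is the thin public facade of the pimpl idiom: each
   * Selection method forwards to the Opaque implementation. A minimal
   * standalone sketch of the pattern, with illustrative names only:
   *
   *   class Widget {
   *   public:
   *     Widget(void) : opaque(new WidgetImpl) {}
   *     ~Widget(void) { delete opaque; }
   *     void run(void) { this->opaque->run(); } // pure forwarding shim
   *   private:
   *     WidgetImpl *opaque; // full definition lives only in the .cpp file
   *   };
   */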
void Selection::select(void) {
    this->opaque->select();
    this->blockList = &this->opaque->blockList;
  }
  uint32_t Selection::getLargestBlockSize(void) const {
    return this->opaque->getLargestBlockSize();
  }
  uint32_t Selection::getVectorNum(void) const {
    return this->opaque->getVectorNum();
  }
  uint32_t Selection::getRegNum(void) const {
    return this->opaque->getRegNum();
  }
  ir::RegisterFamily Selection::getRegisterFamily(ir::Register reg) const {
    return this->opaque->getRegisterFamily(reg);
  }
  ir::RegisterData Selection::getRegisterData(ir::Register reg) const {
    return this->opaque->getRegisterData(reg);
  }
  ir::Register Selection::replaceSrc(SelectionInstruction *insn, uint32_t regID,
                                     ir::Type type, bool needMov) {
    return this->opaque->replaceSrc(insn, regID, type, needMov);
  }
  ir::Register Selection::replaceDst(SelectionInstruction *insn, uint32_t regID,
                                     ir::Type type, bool needMov) {
    return this->opaque->replaceDst(insn, regID, type, needMov);
  }
  bool Selection::spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool) {
    return this->opaque->spillRegs(spilledRegs, registerPool);
  }
  bool Selection::isScalarReg(const ir::Register &reg) const {
    return this->opaque->isScalarReg(reg);
  }
  bool Selection::isPartialWrite(const ir::Register &reg) const {
    return this->opaque->isPartialWrite(reg);
  }
  SelectionInstruction *Selection::create(SelectionOpcode opcode,
                                          uint32_t dstNum, uint32_t srcNum) {
    return this->opaque->create(opcode, dstNum, srcNum);
  }

  ///////////////////////////////////////////////////////////////////////////
  // Implementation of all patterns
  ///////////////////////////////////////////////////////////////////////////

  bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
    using namespace ir;
    const auto &childInsn = cast<LoadImmInstruction>(insn);
    const auto &imm = childInsn.getImmediate();
    if (imm.getType() != TYPE_DOUBLE &&
        imm.getType() != TYPE_S64 &&
        imm.getType() != TYPE_U64)
      return true;
    return false;
  }

  GenRegister getRegisterFromImmediate(ir::Immediate imm, ir::Type type, bool negate = false) {
    using namespace ir;
    int sign = negate ? -1 : 1;
    switch (type) {
      case TYPE_U32: return GenRegister::immud(imm.getIntegerValue() * sign);
      case TYPE_S32: return GenRegister::immd(imm.getIntegerValue() * sign);
      case TYPE_FLOAT: return GenRegister::immf(imm.getFloatValue() * sign);
      case TYPE_U16: return GenRegister::immuw(imm.getIntegerValue() * sign);
      case TYPE_S16: return GenRegister::immw((int16_t)imm.getIntegerValue() * sign);
      case TYPE_U8: return GenRegister::immuw(imm.getIntegerValue() * sign);
      case TYPE_S8: return GenRegister::immw((int8_t)imm.getIntegerValue() * sign);
      case TYPE_DOUBLE: return GenRegister::immdf(imm.getDoubleValue() * sign);
      case TYPE_BOOL:
        return GenRegister::immw((imm.getIntegerValue() == 0) ? 0 : -1); // return 0xffff when true
      case TYPE_HALF: {
        ir::half hf = imm.getHalfValue();
        int16_t _sign = negate ? -1 : 1;
        ir::half hfSign = ir::half::convToHalf(_sign);
        hf = hf * hfSign;
        return GenRegister::immh(hf.getVal());
      }
      default: NOT_SUPPORTED; return GenRegister::immuw(0);
    }
  }

  BVAR(OCL_OPTIMIZE_IMMEDIATE, true);

  void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag,
                                          SelectionDAG *dag0, SelectionDAG *dag1,
                                          GenRegister &src0, GenRegister &src1,
                                          ir::Type type, bool &inverse) {
    using namespace ir;
    inverse = false;
    // Right source can always be an immediate
    const int src0Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src0Index : 0;
    const int src1Index = dag.insn.isMemberOf<SelectInstruction>() ?
SelectInstruction::src1Index : 1;
    if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
        dag1->insn.getOpcode() == OP_LOADI &&
        canGetRegisterFromImmediate(dag1->insn)) {
      const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
      src0 = this->selReg(dag.insn.getSrc(src0Index), type);
      src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
      if (dag0) dag0->isRoot = 1;
    }
    // Left source cannot be immediate but it is OK if we can commute
    else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
             dag.insn.isMemberOf<BinaryInstruction>() &&
             ((cast<BinaryInstruction>(dag.insn)).commutes() ||
              dag.insn.getOpcode() == OP_SUB) &&
             dag0->insn.getOpcode() == OP_LOADI &&
             canGetRegisterFromImmediate(dag0->insn)) {
      const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
      src0 = dag.insn.getOpcode() != OP_SUB ?
             this->selReg(dag.insn.getSrc(src1Index), type) :
             GenRegister::negate(this->selReg(dag.insn.getSrc(src1Index), type));
      Immediate imm = childInsn.getImmediate();
      src1 = getRegisterFromImmediate(imm, type, dag.insn.getOpcode() == OP_SUB);
      if (dag1) dag1->isRoot = 1;
    }
    // If it's a compare instruction, we could theoretically invert the
    // condition code to swap the two operands, but that is unsafe for
    // floats because of NaNs.
    // For a normal select instruction, we can always invert the predication
    // to swap the two operands' positions.
    else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
             dag0->insn.getOpcode() == OP_LOADI &&
             canGetRegisterFromImmediate(dag0->insn) &&
             ((dag.insn.isMemberOf<CompareInstruction>() &&
               type != TYPE_FLOAT && type != TYPE_DOUBLE) ||
              (dag.insn.isMemberOf<SelectInstruction>()))) {
      const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
      src0 = this->selReg(dag.insn.getSrc(src1Index), type);
      src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
      inverse = true;
      if (dag1) dag1->isRoot = 1;
    }
    // Just grab the two sources
    else {
      src0 = this->selReg(dag.insn.getSrc(src0Index), type);
      src1 = this->selReg(dag.insn.getSrc(src1Index), type);
      markAllChildren(dag);
    }
  }

  void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag,
                                          GenRegister &src0, GenRegister &src1,
                                          ir::Type type, bool &inverse) {
    SelectionDAG *dag0 = dag.child[0];
    SelectionDAG *dag1 = dag.child[1];
    getSrcGenRegImm(dag, dag0, dag1, src0, src1, type, inverse);
  }

  /*! Template for the one-to-many instruction patterns */
  template <typename T, typename U>
  class OneToManyPattern : public SelectionPattern {
  public:
    /*! Register the pattern for all opcodes of the family */
    OneToManyPattern(uint32_t insnNum, uint32_t cost) :
      SelectionPattern(insnNum, cost) {
      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
        if (ir::isOpcodeFrom<U>(ir::Opcode(op)) == true)
          this->opcodes.push_back(ir::Opcode(op));
    }
    /*! Call the child method with the proper prototype */
    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
      bool markChildren = true;
      if (static_cast<const T*>(this)->emitOne(sel, ir::cast<U>(dag.insn), markChildren)) {
        if (markChildren)
          markAllChildren(dag);
        return true;
      }
      return false;
    }
  };

  /*! Declare a naive one-to-many pattern */
  #define DECL_PATTERN(FAMILY) \
    struct FAMILY##Pattern : public OneToManyPattern<FAMILY##Pattern, ir::FAMILY>

  #define DECL_CTOR(FAMILY, INSN_NUM, COST) \
    FAMILY##Pattern(void) : OneToManyPattern<FAMILY##Pattern, ir::FAMILY>(INSN_NUM, COST) {}

  /*!
Nullary instruction patterns */ class NullaryInstructionPattern : public SelectionPattern { public: NullaryInstructionPattern(void) : SelectionPattern(1,1) { for (uint32_t op = 0; op < ir::OP_INVALID; ++op) if (ir::isOpcodeFrom(ir::Opcode(op)) == true) this->opcodes.push_back(ir::Opcode(op)); } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::NullaryInstruction &insn = cast(dag.insn); const Opcode opcode = insn.getOpcode(); const Type type = insn.getType(); GenRegister dst = sel.selReg(insn.getDst(0), type); sel.push(); if (sel.isScalarReg(insn.getDst(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } switch (opcode) { case ir::OP_SIMD_SIZE: { const GenRegister src = GenRegister::immud(sel.ctx.getSimdWidth()); sel.MOV(dst, src); } break; case ir::OP_SIMD_ID: { GenRegister laneID = sel.getLaneIDReg(); sel.MOV(dst, laneID); } break; default: NOT_SUPPORTED; } sel.pop(); return true; } }; /*! Unary instruction patterns */ DECL_PATTERN(UnaryInstruction) { static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType, bool isSrc = false) { if (opcode == ir::OP_CBIT) return isSrc ? insnType : ir::TYPE_U32; if (insnType == ir::TYPE_BOOL) return ir::TYPE_U16; else if (opcode == ir::OP_MOV && (insnType == ir::TYPE_U32 || insnType == ir::TYPE_S32)) return ir::TYPE_FLOAT; else return insnType; } INLINE bool emitOne(Selection::Opaque &sel, const ir::UnaryInstruction &insn, bool &markChildren) const { const ir::Opcode opcode = insn.getOpcode(); const ir::Type insnType = insn.getType(); const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode, insnType, false)); const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode, insnType, true)); sel.push(); if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } switch (opcode) { case ir::OP_ABS: { const GenRegister src_ = GenRegister::retype(src, getGenType(insnType)); const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType)); sel.MOV(dst_, GenRegister::abs(src_)); } break; case ir::OP_MOV: { sel.push(); auto dag = sel.regDAG[insn.getDst(0)]; if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL && dag->isUsed) { sel.curr.physicalFlag = 0; sel.curr.flagIndex = insn.getDst(0).value(); sel.curr.modFlag = 1; } sel.MOV(dst, src); sel.pop(); } break; case ir::OP_RNDD: sel.RNDD(dst, src); break; case ir::OP_RNDE: sel.RNDE(dst, src); break; case ir::OP_RNDU: sel.RNDU(dst, src); break; case ir::OP_RNDZ: sel.RNDZ(dst, src); break; case ir::OP_FBH: sel.FBH(dst, src); break; case ir::OP_FBL: sel.FBL(dst, src); break; case ir::OP_CBIT: sel.CBIT(dst, src); break; case ir::OP_LZD: sel.LZD(dst, src); break; case ir::OP_BFREV: sel.BFREV(dst, src); break; case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break; case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break; case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break; case ir::OP_EXP: sel.MATH(dst, GEN_MATH_FUNCTION_EXP, src); break; case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break; case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break; case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break; case ir::OP_BSWAP: { ir::Register tmp = sel.reg(getFamily(insnType)); const GenRegister src_ = GenRegister::retype(src, getGenType(insnType)); const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType)); sel.BSWAP(dst_, src_, 
sel.selReg(tmp, insnType)); break; } case ir::OP_SIMD_ANY: { const GenRegister constZero = GenRegister::immuw(0);; const GenRegister constOne = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_DWORD)), GEN_TYPE_UD); const GenRegister flag01 = GenRegister::flag(0, 1); sel.push(); int simdWidth = sel.curr.execWidth; sel.curr.predicate = GEN_PREDICATE_NONE; sel.MOV(constOne, GenRegister::immud(1)); sel.curr.execWidth = 1; sel.curr.noMask = 1; sel.MOV(flag01, constZero); sel.curr.execWidth = simdWidth; sel.curr.noMask = 0; sel.curr.flag = 0; sel.curr.subFlag = 1; sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero); if (sel.curr.execWidth == 16) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; else if (sel.curr.execWidth == 8) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; else NOT_IMPLEMENTED; sel.SEL(dst, constOne, constZero); sel.pop(); } break; case ir::OP_SIMD_ALL: { const GenRegister constZero = GenRegister::immuw(0); const GenRegister constOne = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_DWORD)), GEN_TYPE_UD); const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one); const GenRegister flag01 = GenRegister::flag(0, 1); sel.push(); int simdWidth = sel.curr.execWidth; sel.curr.predicate = GEN_PREDICATE_NONE; sel.MOV(constOne, GenRegister::immud(1)); sel.curr.execWidth = 1; sel.curr.noMask = 1; sel.MOV(flag01, regOne); sel.curr.execWidth = simdWidth; sel.curr.noMask = 0; sel.curr.flag = 0; sel.curr.subFlag = 1; sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero); if (sel.curr.execWidth == 16) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; else if (sel.curr.execWidth == 8) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; else NOT_IMPLEMENTED; sel.SEL(dst, constOne, constZero); sel.pop(); } break; default: NOT_SUPPORTED; } sel.pop(); return true; } DECL_CTOR(UnaryInstruction, 1, 1) }; /*! Binary regular instruction pattern */ class BinaryInstructionPattern : public SelectionPattern { public: BinaryInstructionPattern(void) : SelectionPattern(1,1) { for (uint32_t op = 0; op < ir::OP_INVALID; ++op) if (ir::isOpcodeFrom(ir::Opcode(op)) == true) this->opcodes.push_back(ir::Opcode(op)); } bool emitDivRemInst(Selection::Opaque &sel, SelectionDAG &dag, ir::Opcode op) const { using namespace ir; const ir::BinaryInstruction &insn = cast(dag.insn); const Type type = insn.getType(); GenRegister dst = sel.selReg(insn.getDst(0), type); GenRegister src0 = sel.selReg(insn.getSrc(0), type); GenRegister src1 = sel.selReg(insn.getSrc(1), type); const uint32_t simdWidth = sel.curr.execWidth; const bool isUniform = simdWidth == 1; const RegisterFamily family = getFamily(type); uint32_t function = (op == OP_DIV)? 
GEN_MATH_FUNCTION_INT_DIV_QUOTIENT : GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
      // Bytes and shorts must be converted to int for DIV and REM, per Gen restriction
      if ((family == FAMILY_WORD || family == FAMILY_BYTE) && (type != TYPE_HALF)) {
        GenRegister tmp0, tmp1;
        ir::Register reg = sel.reg(FAMILY_DWORD, isUniform);

        tmp0 = sel.selReg(reg, ir::TYPE_S32);
        sel.MOV(tmp0, src0);

        tmp1 = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_S32);
        sel.MOV(tmp1, src1);

        sel.MATH(tmp0, function, tmp0, tmp1);
        GenRegister unpacked;
        if (family == FAMILY_WORD) {
          unpacked = sel.unpacked_uw(reg);
        } else {
          unpacked = sel.unpacked_ub(reg);
        }
        unpacked = GenRegister::retype(unpacked, getGenType(type));
        sel.MOV(dst, unpacked);
      } else if (type == TYPE_HALF) {
        ir::Register reg = sel.reg(FAMILY_DWORD, isUniform);
        GenRegister tmp0 = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_FLOAT);
        GenRegister tmp1 = sel.selReg(reg, ir::TYPE_FLOAT);
        sel.MOV(tmp0, src0);
        sel.MOV(tmp1, src1);
        GBE_ASSERT(op != OP_REM);
        sel.MATH(tmp0, GEN_MATH_FUNCTION_FDIV, tmp0, tmp1);
        GenRegister unpacked = GenRegister::retype(sel.unpacked_uw(reg), GEN_TYPE_HF);
        sel.MOV(unpacked, tmp0);
        sel.MOV(dst, unpacked);
      } else if (type == TYPE_S32 || type == TYPE_U32) {
        sel.MATH(dst, function, src0, src1);
      } else if (type == TYPE_FLOAT) {
        GBE_ASSERT(op != OP_REM);
        SelectionDAG *child0 = dag.child[0];
        if (child0 && child0->insn.getOpcode() == OP_LOADI) {
          const auto &loadimm = cast<LoadImmInstruction>(child0->insn);
          const Immediate imm = loadimm.getImmediate();
          float immVal = imm.getFloatValue();
          int *dwPtr = (int*)&immVal;
          // If the immediate is exactly a power of 2, the DIV can be converted to an RCP
          if ((*dwPtr & 0x7FFFFF) == 0) {
            if (immVal == -1.0f) {
              GenRegister tmp = GenRegister::negate(src1);
              sel.MATH(dst, GEN_MATH_FUNCTION_INV, tmp);
            } else {
              sel.MATH(dst, GEN_MATH_FUNCTION_INV, src1);
              if (immVal != 1.0f) {
                GenRegister isrc = GenRegister::immf(immVal);
                sel.MUL(dst, dst, isrc);
              }
            }
            if (dag.child[1])
              dag.child[1]->isRoot = 1;
            return true;
          }
        }
        sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
      } else if (type == TYPE_S64 || type == TYPE_U64) {
        GenRegister tmp[15];
        int tmp_num = 13;
        for (int i = 0; i < 13; i++) {
          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
          tmp[i].type = GEN_TYPE_UD;
        }
        if (sel.hasLongType()) {
          if (!sel.isScalarReg(insn.getSrc(0))) {
            tmp[tmp_num] = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), src0.type);
            tmp_num++;
          }
          if (!sel.isScalarReg(insn.getSrc(1))) {
            tmp[tmp_num] = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), src1.type);
            tmp_num++;
          }
          /* We need at least one tmp register for the conversion if dst is not scalar. */
          if (!sel.isScalarReg(insn.getDst(0)) &&
              sel.isScalarReg(insn.getSrc(0)) &&
              sel.isScalarReg(insn.getSrc(1))) {
            GBE_ASSERT(tmp_num == 13);
            tmp[tmp_num] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
            tmp_num++;
          }
        }
        sel.push();
        sel.curr.flag = 0;
        sel.curr.subFlag = 1;
        if (op == OP_DIV)
          sel.I64DIV(dst, src0, src1, tmp, tmp_num);
        else
          sel.I64REM(dst, src0, src1, tmp, tmp_num);
        sel.pop();
      } else if (type == TYPE_DOUBLE) {
        if (!sel.hasDoubleType())
          GBE_ASSERT(0);
        GenRegister tmp[10];
        int tmpNum = 7;
        ir::RegisterFamily fm;
        if (sel.ctx.getSimdWidth() == 16) {
          fm = FAMILY_WORD;
        } else {
          fm = FAMILY_DWORD;
        }
        /* madm and invm need special accumulator support, which requires align16
           mode. If any src is uniform, we need another tmp register and must MOV
           the uniform one to it. Because madm and invm work in align16 mode,
           the channel mask is different from the align1 mode.
So we can not directly write the result to the dst and need a tmp register to hold the result and MOV it to dst later. */ tmpNum++; //For the dst. if (src0.hstride == GEN_HORIZONTAL_STRIDE_0) tmpNum++; if (src1.hstride == GEN_HORIZONTAL_STRIDE_0) tmpNum++; for (int i = 0; i < tmpNum; i++) tmp[i] = GenRegister::df8grf(sel.reg(fm)); sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.F64DIV(dst, src0, src1, tmp, tmpNum); sel.pop(); } else { GBE_ASSERT(0); } markAllChildren(dag); return true; } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::BinaryInstruction &insn = cast(dag.insn); const Opcode opcode = insn.getOpcode(); const Type type = insn.getType(); GenRegister dst = sel.selReg(insn.getDst(0), type); sel.push(); // Boolean values use scalars if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } if(opcode == OP_DIV || opcode == OP_REM) { bool ret = this->emitDivRemInst(sel, dag, opcode); sel.pop(); return ret; } // Immediates not supported if (opcode == OP_POW) { GenRegister src0 = sel.selReg(insn.getSrc(0), type); GenRegister src1 = sel.selReg(insn.getSrc(1), type); if(type == TYPE_FLOAT) { sel.MATH(dst, GEN_MATH_FUNCTION_POW, src0, src1); } else { NOT_IMPLEMENTED; } markAllChildren(dag); sel.pop(); return true; } // Look for immediate values GenRegister src0, src1; bool inverse = false; sel.getSrcGenRegImm(dag, src0, src1, type, inverse); // Output the binary instruction if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL && dag.isUsed) { GBE_ASSERT(insn.getOpcode() == OP_AND || insn.getOpcode() == OP_OR || insn.getOpcode() == OP_XOR); sel.curr.physicalFlag = 0; sel.curr.flagIndex = insn.getDst(0).value(); sel.curr.modFlag = 1; } switch (opcode) { case OP_ADD: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) { GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64); sel.I64ADD(dst, src0, src1, t); } else sel.ADD(dst, src0, src1); break; case OP_ADDSAT: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) { GenRegister tmp[5]; for(int i=0; i<5; i++) { tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); tmp[i].type = GEN_TYPE_UD; } sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.I64SATADD(dst, src0, src1, tmp); sel.pop(); break; } sel.push(); sel.curr.saturate = GEN_MATH_SATURATE_SATURATE; sel.ADD(dst, src0, src1); sel.pop(); break; case OP_XOR: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) sel.I64XOR(dst, src0, src1); else sel.XOR(dst, src0, src1); break; case OP_OR: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) sel.I64OR(dst, src0, src1); else sel.OR(dst, src0, src1); break; case OP_AND: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) sel.I64AND(dst, src0, src1); else sel.AND(dst, src0, src1); break; case OP_SUB: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) { GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64); sel.I64SUB(dst, src0, src1, t); } else sel.ADD(dst, src0, GenRegister::negate(src1)); break; case OP_SUBSAT: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) { GenRegister tmp[5]; for(int i=0; i<5; i++) { tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); tmp[i].type = GEN_TYPE_UD; } sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.I64SATSUB(dst, src0, src1, tmp); 
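          /* The emulated 64-bit path above needs the five dword temporaries;
           * for narrower types the fall-through below simply lets the hardware
           * clamp. Roughly (a sketch of the code that follows):
           *   sel.push();
           *   sel.curr.saturate = GEN_MATH_SATURATE_SATURATE; // saturating writeback
           *   sel.ADD(dst, src0, GenRegister::negate(src1));  // dst = sat(src0 - src1)
           *   sel.pop();
           */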
sel.pop(); break; } sel.push(); sel.curr.saturate = GEN_MATH_SATURATE_SATURATE; sel.ADD(dst, src0, GenRegister::negate(src1)); sel.pop(); break; case OP_SHL: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) { GenRegister tmp[6]; for(int i = 0; i < 6; i ++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.I64SHL(dst, src0, src1, tmp); sel.pop(); } else sel.SHL(dst, src0, src1); break; case OP_SHR: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) { GenRegister tmp[6]; for(int i = 0; i < 6; i ++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.I64SHR(dst, src0, src1, tmp); sel.pop(); } else sel.SHR(dst, src0, src1); break; case OP_ASR: if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) { GenRegister tmp[6]; for(int i = 0; i < 6; i ++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.I64ASR(dst, src0, src1, tmp); sel.pop(); } else sel.ASR(dst, src0, src1); break; case OP_MUL_HI: { GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); sel.MUL_HI(dst, src0, src1, temp); break; } case OP_I64_MUL_HI: { int tmp_num; GenRegister temp[9]; if (sel.hasLongType()) { for(int i=0; i<9; i++) { temp[i] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); } tmp_num = 6; } else { for(int i=0; i<9; i++) { temp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); temp[i].type = GEN_TYPE_UD; } tmp_num = 9; } sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.I64_MUL_HI(dst, src0, src1, temp, tmp_num); sel.pop(); break; } case OP_MUL: if (type == TYPE_U32 || type == TYPE_S32) { sel.pop(); return false; } else if (type == TYPE_S64 || type == TYPE_U64) { if (sel.hasLongType()) { GenRegister tmp; tmp = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); sel.I64MUL(dst, src0, src1, &tmp, true); } else { GenRegister tmp[6]; for(int i = 0; i < 6; i++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); sel.I64MUL(dst, src0, src1, tmp, false); } } else sel.MUL(dst, src0, src1); break; case OP_HADD: { GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D); sel.HADD(dst, src0, src1, temp); break; } case OP_RHADD: { GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D); sel.RHADD(dst, src0, src1, temp); break; } case OP_I64HADD: { GenRegister tmp[4]; if (!sel.hasLongType()) { for(int i=0; i<4; i++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); sel.I64HADD(dst, src0, src1, tmp, 4); } else { tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); sel.I64HADD(dst, src0, src1, tmp, 2); } break; } case OP_I64RHADD: { GenRegister tmp[4]; if (!sel.hasLongType()) { for(int i=0; i<4; i++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); sel.I64RHADD(dst, src0, src1, tmp, 4); } else { tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); sel.I64RHADD(dst, src0, src1, tmp, 2); } break; } case OP_UPSAMPLE_SHORT: { dst = GenRegister::retype(sel.unpacked_uw(dst.reg()), GEN_TYPE_B); src0 = GenRegister::retype(sel.unpacked_uw(src0.reg()), GEN_TYPE_B); src1 = GenRegister::retype(sel.unpacked_uw(src1.reg()), GEN_TYPE_B); sel.MOV(dst, src1); dst = sel.getOffsetReg(dst, 0, typeSize(GEN_TYPE_B)); sel.MOV(dst, src0); break; } case OP_UPSAMPLE_INT: { dst = sel.unpacked_uw(dst.reg()); src0 = sel.unpacked_uw(src0.reg()); 
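          /* upsample(hi, lo) concatenates hi above lo, i.e. result = (hi << N) | lo
           * for element width N (a rough sketch of the semantics). Rather than
           * shifting, the code below, like the OP_UPSAMPLE_SHORT case above, views
           * the registers through unpacked strides and emits two interleaved MOVs:
           * src1 (lo) into the base sub-registers of dst, then src0 (hi) into the
           * same view offset by one sub-register. */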
src1 = sel.unpacked_uw(src1.reg());
        sel.MOV(dst, src1);
        dst = sel.getOffsetReg(dst, 0, typeSize(GEN_TYPE_W));
        sel.MOV(dst, src0);
        break;
      }
      case OP_UPSAMPLE_LONG:
        sel.UPSAMPLE_LONG(dst, src0, src1);
        break;
      default: NOT_IMPLEMENTED;
      }
      sel.pop();
      return true;
    }
  };

  /*! MAD pattern */
  class MulAddInstructionPattern : public SelectionPattern {
  public:
    /*! Register the pattern for all opcodes of the family */
    MulAddInstructionPattern(void) : SelectionPattern(2, 1) {
      this->opcodes.push_back(ir::OP_ADD);
      this->opcodes.push_back(ir::OP_SUB);
    }

    /*! Implements base class */
    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
      using namespace ir;

      // XXX TODO: we need clean support of FP_CONTRACT to remove the 'return false'
      // below; if '#pragma FP_CONTRACT OFF' is used in a CL kernel, we must not do
      // the mad optimization.
      if (!sel.ctx.relaxMath)
        return false;
      // MADs tend to increase liveness of the sources (since there are three
      // of them). TODO: refine this strategy; we should at least be able to
      // evaluate per-basic-block register pressure and selectively
      // enable/disable MADs.
      if (sel.ctx.limitRegisterPressure)
        return false;

      // We are good to try. We need a MUL for one of the two sources
      const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
      if (insn.getType() != TYPE_FLOAT)
        return false;
      SelectionDAG *child0 = dag.child[0];
      SelectionDAG *child1 = dag.child[1];
      const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_FLOAT);
      if (child0 && child0->insn.getOpcode() == OP_MUL) {
        GBE_ASSERT(cast<ir::BinaryInstruction>(child0->insn).getType() == TYPE_FLOAT);
        const GenRegister src0 = sel.selReg(child0->insn.getSrc(0), TYPE_FLOAT);
        const GenRegister src1 = sel.selReg(child0->insn.getSrc(1), TYPE_FLOAT);
        GenRegister src2 = sel.selReg(insn.getSrc(1), TYPE_FLOAT);
        if (insn.getOpcode() == ir::OP_SUB)
          src2 = GenRegister::negate(src2);
        sel.push();
        if (sel.isScalarReg(insn.getDst(0)))
          sel.curr.execWidth = 1;
        sel.MAD(dst, src2, src0, src1); // order different on HW!
        sel.pop();
        if (child0->child[0]) child0->child[0]->isRoot = 1;
        if (child0->child[1]) child0->child[1]->isRoot = 1;
        if (child1) child1->isRoot = 1;
        return true;
      }
      if (child1 && child1->insn.getOpcode() == OP_MUL) {
        GBE_ASSERT(cast<ir::BinaryInstruction>(child1->insn).getType() == TYPE_FLOAT);
        GenRegister src0 = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT);
        const GenRegister src1 = sel.selReg(child1->insn.getSrc(1), TYPE_FLOAT);
        const GenRegister src2 = sel.selReg(insn.getSrc(0), TYPE_FLOAT);
        if (insn.getOpcode() == ir::OP_SUB)
          src0 = GenRegister::negate(src0);
        sel.push();
        if (sel.isScalarReg(insn.getDst(0)))
          sel.curr.execWidth = 1;
        sel.MAD(dst, src2, src0, src1); // order different on HW!
        sel.pop();
        if (child1->child[0]) child1->child[0]->isRoot = 1;
        if (child1->child[1]) child1->child[1]->isRoot = 1;
        if (child0) child0->isRoot = 1;
        return true;
      }
      return false;
    }
  };

  /*! There are some patterns like:
        sqrt r1, r2; load r4, 1.0; div r3, r4, r1;
      ===> rsqrt r3, r2
   */
  class SqrtDivInstructionPattern : public SelectionPattern {
  public:
    /*! Register the pattern for all opcodes of the family */
    SqrtDivInstructionPattern(void) : SelectionPattern(1, 1) {
      this->opcodes.push_back(ir::OP_DIV);
    }

    /*! Implements base class */
    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
      using namespace ir;
      // We are good to try.
We need a MUL for one of the two sources const ir::BinaryInstruction &insn = cast(dag.insn); if (insn.getType() != TYPE_FLOAT) return false; SelectionDAG *child0 = dag.child[0]; SelectionDAG *child1 = dag.child[1]; const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_FLOAT); if (child1 && child1->insn.getOpcode() == OP_SQR) { GBE_ASSERT(cast(child1->insn).getType() == TYPE_FLOAT); GenRegister srcSQR = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT); const GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_FLOAT); const GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_FLOAT); float immVal = 0.0f; if (child0 && child0->insn.getOpcode() == OP_LOADI) { const auto &loadimm = cast(child0->insn); const Immediate imm = loadimm.getImmediate(); const Type type = imm.getType(); if (type == TYPE_FLOAT) immVal = imm.getFloatValue(); else if (type == TYPE_S32 || type == TYPE_U32) immVal = imm.getIntegerValue(); } sel.push(); if (sel.isScalarReg(insn.getDst(0))) sel.curr.execWidth = 1; if (immVal == 1.0f) { sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, srcSQR); if (child1->child[0]) child1->child[0]->isRoot = 1; } else { sel.MATH(tmp, GEN_MATH_FUNCTION_RSQ, srcSQR); if (immVal != 0.0f) { GenRegister isrc = GenRegister::immf(immVal); sel.MUL(dst, tmp, isrc); } else { sel.MUL(dst, src0, tmp); if (child0) child0->isRoot = 1; } if (child1->child[0]) child1->child[0]->isRoot = 1; } sel.pop(); return true; } return false; } }; /*! sel.{le,l,ge...} like patterns */ class SelectModifierInstructionPattern : public SelectionPattern { public: /*! Register the pattern for all opcodes of the family */ SelectModifierInstructionPattern(void) : SelectionPattern(2, 1) { this->opcodes.push_back(ir::OP_SEL); } /*! Implements base class */ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; SelectionDAG *cmp = dag.child[0]; const SelectInstruction &insn = cast(dag.insn); if (insn.getType() == TYPE_S64 || insn.getType() == TYPE_U64) // not support return false; // Not in this block if (cmp == NULL) return false; // We need to match a compare if (cmp->insn.isMemberOf() == false) return false; // We look for something like that: // cmp.{le,ge...} flag src0 src1 // sel dst flag src0 src1 // So both sources must match if (sourceMatch(cmp, 0, &dag, 1) == false) return false; if (sourceMatch(cmp, 1, &dag, 2) == false) return false; // OK, we merge the instructions const ir::CompareInstruction &cmpInsn = cast(cmp->insn); const ir::Opcode opcode = cmpInsn.getOpcode(); if(opcode == OP_ORD) return false; GenRegister src0, src1; const ir::Type type = cmpInsn.getType(); bool inverse = false; sel.getSrcGenRegImm(*cmp, src0, src1, type, inverse); const uint32_t genCmp = getGenCompare(opcode, inverse); sel.push(); if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } // Like for regular selects, we need a temporary since we cannot predicate // properly const uint32_t simdWidth = sel.curr.execWidth; const GenRegister dst = sel.selReg(insn.getDst(0), type); sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.execWidth = simdWidth; sel.SEL_CMP(genCmp, dst, src0, src1); sel.pop(); return true; } }; /*! 32 bits integer multiply needs more instructions */ class Int32x32MulInstructionPattern : public SelectionPattern { public: /*! Register the pattern for all opcodes of the family */ Int32x32MulInstructionPattern(void) : SelectionPattern(1, 4) { this->opcodes.push_back(ir::OP_MUL); } /*! 
Implements base class */ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::BinaryInstruction &insn = cast(dag.insn); const Type type = insn.getType(); if (type != TYPE_U32 && type != TYPE_S32) return false; GenRegister dst = sel.selReg(insn.getDst(0), type); GenRegister src0 = sel.selReg(insn.getSrc(0), type); GenRegister src1 = sel.selReg(insn.getSrc(1), type); sel.push(); if (sel.has32X32Mul()) { if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MUL(dst, src0, src1); } else { if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } const int simdWidth = sel.curr.execWidth; // Either left part of the 16-wide register or just a simd 8 register dst = GenRegister::retype(dst, GEN_TYPE_D); src0 = GenRegister::retype(src0, GEN_TYPE_D); src1 = GenRegister::retype(src1, GEN_TYPE_D); sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q1; sel.MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), src0, src1); sel.curr.accWrEnable = 1; sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), src0, src1); sel.curr.accWrEnable = 0; if (simdWidth == 1) { sel.curr.execWidth = 1; sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::vec1(GenRegister::acc())); } else { sel.curr.execWidth = 8; sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::acc()); } // Right part of the 16-wide register now if (simdWidth == 16) { int predicate = sel.curr.predicate; int noMask = sel.curr.noMask; sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; const GenRegister nextSrc0 = sel.selRegQn(insn.getSrc(0), 1, TYPE_S32); const GenRegister nextSrc1 = sel.selRegQn(insn.getSrc(1), 1, TYPE_S32); sel.MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), nextSrc0, nextSrc1); sel.curr.accWrEnable = 1; sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), nextSrc0, nextSrc1); sel.curr.accWrEnable = 0; sel.curr.quarterControl = GEN_COMPRESSION_Q2; if (predicate != GEN_PREDICATE_NONE || noMask != 1) { const ir::Register reg = sel.reg(FAMILY_DWORD); sel.MOV(GenRegister::f8grf(reg), GenRegister::acc()); sel.curr.noMask = noMask;; sel.curr.predicate = predicate; sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F), GenRegister::f8grf(reg)); } else sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F), GenRegister::acc()); } } sel.pop(); // All children are marked as root markAllChildren(dag); return true; } }; /*! 32x16 bits integer can be done in one instruction */ class Int32x16MulInstructionPattern : public SelectionPattern { public: /*! Register the pattern for all opcodes of the family */ Int32x16MulInstructionPattern(void) : SelectionPattern(1, 1) { this->opcodes.push_back(ir::OP_MUL); } bool is16BitSpecialReg(ir::Register reg) const { if (reg == ir::ocl::lid0 || reg == ir::ocl::lid1 || reg == ir::ocl::lid2 || reg == ir::ocl::lsize0 || reg == ir::ocl::lsize1 || reg == ir::ocl::lsize2 || reg == ir::ocl::enqlsize0 || reg == ir::ocl::enqlsize1 || reg == ir::ocl::enqlsize2) return true; else return false; } /*! 
Try to emit a multiply where child childID is a 16 immediate */ bool emitMulImmediate(Selection::Opaque &sel, SelectionDAG &dag, uint32_t childID) const { using namespace ir; const ir::BinaryInstruction &insn = cast(dag.insn); const Register dst = insn.getDst(0); const Register src1 = insn.getSrc(childID ^ 1); const SelectionDAG *src0DAG = dag.child[childID]; if (src0DAG != NULL) { if (src0DAG->insn.getOpcode() == OP_LOADI) { const auto &loadimm = cast(src0DAG->insn); const Immediate imm = loadimm.getImmediate(); const Type type = imm.getType(); GBE_ASSERT(type == TYPE_U32 || type == TYPE_S32); if (type == TYPE_U32 && imm.getIntegerValue() <= 0xffff) { sel.push(); if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MUL(sel.selReg(dst, type), sel.selReg(src1, type), GenRegister::immuw(imm.getIntegerValue())); sel.pop(); if (dag.child[childID ^ 1] != NULL) dag.child[childID ^ 1]->isRoot = 1; return true; } if (type == TYPE_S32 && (imm.getIntegerValue() >= -32768 && imm.getIntegerValue() <= 32767)) { sel.push(); if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MUL(sel.selReg(dst, type), sel.selReg(src1, type), GenRegister::immw(imm.getIntegerValue())); sel.pop(); if (dag.child[childID ^ 1] != NULL) dag.child[childID ^ 1]->isRoot = 1; return true; } } } return false; } /*! Try to emit a multiply with a 16 bit special register */ bool emitMulSpecialReg(Selection::Opaque &sel, SelectionDAG &dag, uint32_t childID) const { using namespace ir; const BinaryInstruction &insn = cast(dag.insn); const Type type = insn.getType(); const Register dst = insn.getDst(0); const Register src0 = insn.getSrc(childID); const Register src1 = insn.getSrc(childID ^ 1); if (is16BitSpecialReg(src0)) { sel.push(); if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MUL(sel.selReg(dst, type), sel.selReg(src1, type), sel.selReg(src0, TYPE_U32)); sel.pop(); markAllChildren(dag); return true; } return false; } virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const BinaryInstruction &insn = cast(dag.insn); const Type type = insn.getType(); if (type == TYPE_U32 || type == TYPE_S32) { if (this->emitMulSpecialReg(sel, dag, 0)) return true; if (this->emitMulSpecialReg(sel, dag, 1)) return true; if (this->emitMulImmediate(sel, dag, 0)) return true; if (this->emitMulImmediate(sel, dag, 1)) return true; } return false; } }; #define DECL_NOT_IMPLEMENTED_ONE_TO_MANY(FAMILY) \ struct FAMILY##Pattern : public OneToManyPattern\ {\ INLINE bool emitOne(Selection::Opaque &sel, const ir::FAMILY &insn, bool &markChildren) const {\ NOT_IMPLEMENTED;\ return false;\ }\ DECL_CTOR(FAMILY, 1, 1); \ } #undef DECL_NOT_IMPLEMENTED_ONE_TO_MANY /*! 
Load immediate pattern */ DECL_PATTERN(LoadImmInstruction) { INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadImmInstruction &insn, bool &markChildren) const { using namespace ir; const Type type = insn.getType(); const Immediate imm = insn.getImmediate(); const GenRegister dst = sel.selReg(insn.getDst(0), type); sel.push(); if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } switch (type) { case TYPE_BOOL: if (!sel.isScalarReg(insn.getDst(0)) && sel.regDAG[insn.getDst(0)]->isUsed) { sel.curr.modFlag = 1; sel.curr.physicalFlag = 0; sel.curr.flagIndex = insn.getDst(0).value(); } sel.MOV(dst, imm.getIntegerValue() ? GenRegister::immuw(0xffff) : GenRegister::immuw(0)); break; case TYPE_U32: sel.MOV(dst, GenRegister::immud(imm.getIntegerValue())); break; case TYPE_S32: sel.MOV(dst, GenRegister::immd(imm.getIntegerValue())); break; case TYPE_FLOAT: sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::immf(imm.asFloatValue())); break; case TYPE_HALF: { ir::half hf = imm.getHalfValue(); sel.MOV(GenRegister::retype(dst, GEN_TYPE_HF), GenRegister::immh(hf.getVal())); break; } case TYPE_U16: sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break; case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break; case TYPE_U8: sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break; case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break; case TYPE_DOUBLE: sel.MOV(dst, GenRegister::immdf(imm.getDoubleValue())); break; case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break; case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immuint64(imm.getIntegerValue())); break; default: NOT_SUPPORTED; } sel.pop(); return true; } DECL_CTOR(LoadImmInstruction, 1,1); }; /*! Sync instruction */ DECL_PATTERN(SyncInstruction) { INLINE bool emitOne(Selection::Opaque &sel, const ir::SyncInstruction &insn, bool &markChildren) const { using namespace ir; const ir::Register reg = sel.reg(FAMILY_DWORD); const uint32_t params = insn.getParameters(); // A barrier is OK to start the thread synchronization *and* SLM fence sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params); return true; } DECL_CTOR(SyncInstruction, 1,1); }; /*! 
Wait instruction */ DECL_PATTERN(WaitInstruction) { INLINE bool emitOne(Selection::Opaque &sel, const ir::WaitInstruction &insn, bool &markChildren) const { using namespace ir; // Debugwait will use reg 1, which is different from barrier sel.push(); sel.curr.noMask = 1; sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.WAIT(1); sel.pop(); return true; } DECL_CTOR(WaitInstruction, 1,1); }; INLINE uint32_t getByteScatterGatherSize(Selection::Opaque &sel, ir::Type type) { using namespace ir; switch (type) { case TYPE_DOUBLE: case TYPE_S64: case TYPE_U64: return GEN_BYTE_SCATTER_QWORD; case TYPE_FLOAT: case TYPE_U32: case TYPE_S32: return GEN_BYTE_SCATTER_DWORD; case TYPE_BOOL: case TYPE_U16: case TYPE_S16: return GEN_BYTE_SCATTER_WORD; case TYPE_U8: case TYPE_S8: return GEN_BYTE_SCATTER_BYTE; case TYPE_HALF: if (sel.hasHalfType()) return GEN_BYTE_SCATTER_WORD; default: NOT_SUPPORTED; return GEN_BYTE_SCATTER_BYTE; } } ir::Register generateLocalMask(Selection::Opaque &sel, GenRegister addr) { sel.push(); ir::Register localMask = sel.reg(ir::FAMILY_BOOL); sel.curr.physicalFlag = 0; sel.curr.modFlag = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.flagIndex = localMask; sel.CMP(GEN_CONDITIONAL_L, addr, GenRegister::immud(64*1024)); sel.pop(); return localMask; } class LoadInstructionPattern : public SelectionPattern { public: /*! Register the pattern for all opcodes of the family */ LoadInstructionPattern(void) : SelectionPattern(1, 1) { this->opcodes.push_back(ir::OP_LOAD); } bool isReadConstantLegacy(const ir::LoadInstruction &load) const { ir::AddressMode AM = load.getAddressMode(); ir::AddressSpace AS = load.getAddressSpace(); if (AM != ir::AM_Stateless && AS == ir::MEM_CONSTANT) return true; return false; } void untypedReadStateless(Selection::Opaque &sel, GenRegister addr, vector &dst ) const { using namespace ir; GenRegister addrQ; unsigned simdWidth = sel.curr.execWidth; unsigned addrBytes = typeSize(addr.type); unsigned valueNum = dst.size(); bool isUniform = sel.isScalarReg(dst[0].reg()); if (addrBytes == 4) { addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64); sel.MOV(addrQ, addr); } else if (addrBytes == 8) { addrQ = addr; } else NOT_IMPLEMENTED; if (simdWidth == 8) { sel.UNTYPED_READA64(addrQ, dst.data(), valueNum, valueNum); } else if (simdWidth == 16) { std::vector tmpData; for (unsigned i = 0; i < (valueNum+1)/2; i++) { tmpData.push_back(sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32)); } sel.push(); /* first quarter */ sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q1; sel.UNTYPED_READA64(GenRegister::Qn(addrQ, 0), tmpData.data(), (valueNum+1)/2, valueNum); sel.push(); if (isUniform) sel.curr.execWidth = 1; for (unsigned k = 0; k < valueNum; k++) { sel.MOV(GenRegister::Qn(dst[k], 0), GenRegister::Qn(tmpData[k/2], k%2)); } sel.pop(); /* second quarter */ sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q2; sel.UNTYPED_READA64(GenRegister::Qn(addrQ, 1), tmpData.data(), (valueNum+1)/2, valueNum); if (isUniform) sel.curr.execWidth = 1; for (unsigned k = 0; k < valueNum; k++) { sel.MOV(GenRegister::Qn(dst[k], 1), GenRegister::Qn(tmpData[k/2], k%2)); } sel.pop(); } } void shootUntypedReadMsg(Selection::Opaque &sel, const ir::LoadInstruction &insn, vector &dst, GenRegister addr, uint32_t valueNum, ir::AddressSpace addrSpace) const { using namespace ir; unsigned addrBytes = typeSize(addr.type); AddressMode AM = insn.getAddressMode(); /* Notes on uniform of LoadInstruction, all-lanes-active(noMask,noPredicate) * 
property only needs to be taken care of when the value is UNIFORM; if the
     * value is not uniform, just do things under predication or mask. */
      bool isUniform = sel.isScalarReg(dst[0].reg());
      sel.push();
        if (isUniform) {
          sel.curr.noMask = 1;
          sel.curr.predicate = GEN_PREDICATE_NONE;
        }
        vector<GenRegister> btiTemp = sel.getBTITemps(AM);

        if (AM == AM_DynamicBti || AM == AM_StaticBti) {
          if (AM == AM_DynamicBti) {
            Register btiReg = insn.getBtiReg();
            sel.UNTYPED_READ(addr, dst.data(), valueNum, sel.selReg(btiReg, TYPE_U32), btiTemp);
          } else {
            unsigned SI = insn.getSurfaceIndex();
            sel.UNTYPED_READ(addr, dst.data(), valueNum, GenRegister::immud(SI), btiTemp);
          }
        } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) {
          // Stateless mode: local/constant still use bti access
          unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
          GenRegister addrDW = addr;
          if (addrBytes == 8)
            addrDW = convertU64ToU32(sel, addr);
          sel.UNTYPED_READ(addrDW, dst.data(), valueNum, GenRegister::immud(bti), btiTemp);
        } else if (addrSpace == ir::MEM_GENERIC) {
          Register localMask = generateLocalMask(sel, addr);
          sel.push();
            sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
            GenRegister addrDW = addr;
            if (addrBytes == 8)
              addrDW = convertU64ToU32(sel, addr);
            sel.UNTYPED_READ(addrDW, dst.data(), valueNum, GenRegister::immud(0xfe), btiTemp);

            sel.curr.inversePredicate = 1;
            untypedReadStateless(sel, addr, dst);
          sel.pop();
        } else {
          untypedReadStateless(sel, addr, dst);
        }
      sel.pop();
    }

    void emitUntypedRead(Selection::Opaque &sel,
                         const ir::LoadInstruction &insn,
                         GenRegister addr,
                         ir::AddressSpace addrSpace) const
    {
      using namespace ir;
      const uint32_t valueNum = insn.getValueNum();
      vector<GenRegister> dst(valueNum);
      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
        dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
      shootUntypedReadMsg(sel, insn, dst, addr, valueNum, addrSpace);
    }

    void emitDWordGather(Selection::Opaque &sel,
                         const ir::LoadInstruction &insn,
                         GenRegister addr,
                         ir::AddressSpace addrSpace) const
    {
      using namespace ir;
      GBE_ASSERT(insn.getValueNum() == 1);
      const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));

      if (isUniform) {
        GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
        sel.push();
          sel.curr.noMask = 1;
          sel.SAMPLE(&dst, 1, &addr, 1, BTI_CONSTANT, 0, true, true);
        sel.pop();
        return;
      }

      GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
      // Get a dword-based address
      GenRegister addrDW = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
      sel.push();
        if (sel.isScalarReg(addr.reg())) {
          sel.curr.noMask = 1;
        }
        if (sel.getRegisterFamily(addr.reg()) == FAMILY_QWORD) {
          // As we still use an offset instead of an absolute graphics address,
          // it is safe to convert from u64 to u32.
          GenRegister t = convertU64ToU32(sel, addr);
          sel.SHR(addrDW, t, GenRegister::immud(2));
        } else
          sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
      sel.pop();

      sel.DWORD_GATHER(dst, addrDW, BTI_CONSTANT);
    }

    void read64Legacy(Selection::Opaque &sel,
                      GenRegister addr,
                      vector<GenRegister> &dst,
                      GenRegister bti,
                      vector<GenRegister> &btiTemp) const
    {
      const uint32_t valueNum = dst.size();
      if (sel.hasLongType()) {
        vector<GenRegister> tmp(valueNum);
        for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
          tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
        }
        sel.READ64(addr, dst.data(), tmp.data(), valueNum, bti, true, btiTemp);
      } else {
        sel.READ64(addr, dst.data(), NULL, valueNum, bti, false, btiTemp);
      }
    }
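    // Stateless (A64) 64-bit reads carry one qword per lane in the payload,
    // so a SIMD16 read does not fit in a single message here; the helper
    // below therefore splits it into two SIMD8 halves driven by the Q1/Q2
    // quarter control.
    void read64Stateless(Selection::Opaque &sel, const GenRegister addr,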
vector dst) const { using namespace ir; unsigned simdWidth = sel.ctx.getSimdWidth(); unsigned valueNum = dst.size(); vector tmp(valueNum); for (uint32_t valueID = 0; valueID < valueNum; ++valueID) { tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL); } unsigned addrBytes = typeSize(addr.type); GenRegister addrQ; sel.push(); if (addrBytes == 4) { addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64); sel.MOV(addrQ, addr); } else { addrQ = addr; } if (simdWidth == 8) { sel.READ64A64(addrQ, dst.data(), tmp.data(), valueNum); } else { assert(valueNum == 1); GenRegister tmpAddr, tmpDst; tmpAddr = GenRegister::Qn(addrQ, 0); tmpDst = GenRegister::Qn(dst[0], 0); sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q1; sel.READ64A64(tmpAddr, &tmpDst, tmp.data(), valueNum); tmpAddr = GenRegister::Qn(addrQ, 1); tmpDst = GenRegister::Qn(dst[0], 1); sel.curr.quarterControl = GEN_COMPRESSION_Q2; sel.READ64A64(tmpAddr, &tmpDst, tmp.data(), valueNum); } sel.pop(); } void emitRead64(Selection::Opaque &sel, const ir::LoadInstruction &insn, GenRegister addr, ir::AddressSpace addrSpace) const { using namespace ir; const uint32_t valueNum = insn.getValueNum(); /* XXX support scalar only right now. */ GBE_ASSERT(valueNum == 1); vector dst(valueNum); for ( uint32_t dstID = 0; dstID < valueNum; ++dstID) dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64); bool isUniform = sel.isScalarReg(insn.getValue(0)); unsigned addrBytes = typeSize(addr.type); AddressMode AM = insn.getAddressMode(); vector btiTemp = sel.getBTITemps(AM); sel.push(); if (isUniform) { sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; } if (AM != AM_Stateless) { GenRegister b; if (AM == AM_DynamicBti) { b = sel.selReg(insn.getBtiReg(), TYPE_U32); } else { b = GenRegister::immud(insn.getSurfaceIndex()); } read64Legacy(sel, addr, dst, b, btiTemp); } else if (addrSpace == MEM_LOCAL || isReadConstantLegacy(insn)) { GenRegister b = GenRegister::immud(addrSpace == MEM_LOCAL? 
0xfe : BTI_CONSTANT); GenRegister addrDW = addr; if (addrBytes == 8) addrDW = convertU64ToU32(sel, addr); read64Legacy(sel, addrDW, dst, b, btiTemp); } else if (addrSpace == ir::MEM_GENERIC) { Register localMask = generateLocalMask(sel, addr); sel.push(); sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL); GenRegister addrDW = addr; if (addrBytes == 8) addrDW = convertU64ToU32(sel, addr); read64Legacy(sel, addrDW, dst, GenRegister::immud(0xfe), btiTemp); sel.curr.inversePredicate = 1; read64Stateless(sel, addr, dst); sel.pop(); } else { read64Stateless(sel, addr, dst); } sel.pop(); } void readByteAsDWord(Selection::Opaque &sel, const ir::LoadInstruction &insn, const uint32_t elemSize, GenRegister address, GenRegister dst, bool isUniform, ir::AddressSpace addrSpace) const { using namespace ir; RegisterFamily addrFamily = sel.getRegisterFamily(address.reg()); Type addrType = getType(addrFamily); Register tmpReg = sel.reg(FAMILY_DWORD, isUniform); GenRegister tmpAddr = sel.selReg(sel.reg(addrFamily, isUniform), addrType); GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32); GenRegister addrOffset = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); // Get dword aligned addr sel.push(); if (isUniform) { sel.curr.noMask = 1; sel.curr.execWidth = 1; } if (addrFamily == FAMILY_DWORD) sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc)); else { sel.MOV(tmpAddr, GenRegister::immuint64(0xfffffffffffffffc)); sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UL), tmpAddr); } sel.pop(); sel.push(); vector tmp; tmp.push_back(tmpData); shootUntypedReadMsg(sel, insn, tmp, tmpAddr, 1, addrSpace); if (isUniform) sel.curr.noMask = 1; if (isUniform) sel.curr.execWidth = 1; // Get the remaining offset from aligned addr if (addrFamily == FAMILY_QWORD) { sel.AND(addrOffset, sel.unpacked_ud(address.reg()), GenRegister::immud(0x3)); } else { sel.AND(addrOffset, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3)); } sel.SHL(addrOffset, addrOffset, GenRegister::immud(0x3)); sel.SHR(tmpData, tmpData, addrOffset); if (elemSize == GEN_BYTE_SCATTER_WORD) sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), GenRegister::unpacked_uw(tmpReg, isUniform, sel.isLongReg(tmpReg))); else if (elemSize == GEN_BYTE_SCATTER_BYTE) sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), GenRegister::unpacked_ub(tmpReg, isUniform)); sel.pop(); } // The address is dw aligned. void emitAlignedByteGather(Selection::Opaque &sel, const ir::LoadInstruction &insn, const uint32_t elemSize, GenRegister address, ir::AddressSpace addrSpace) const { using namespace ir; const uint32_t valueNum = insn.getValueNum(); const bool isUniform = sel.isScalarReg(insn.getValue(0)); RegisterFamily family = getFamily(insn.getValueType()); vector dst(valueNum); const uint32_t typeSize = getFamilySize(family); for(uint32_t i = 0; i < valueNum; i++) dst[i] = sel.selReg(insn.getValue(i), getType(family)); uint32_t tmpRegNum = (typeSize*valueNum + 3) / 4; vector tmp(tmpRegNum); for(uint32_t i = 0; i < tmpRegNum; i++) { tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); } shootUntypedReadMsg(sel, insn, tmp, address, tmpRegNum, addrSpace); for(uint32_t i = 0; i < tmpRegNum; i++) { unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ? 4/typeSize : (valueNum - i * (4 / typeSize)); sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], typeSize, elemNum); } } // Gather effect data to the effectData vector from the tmp vector. // x x d0 d1 | d2 d3 d4 d5 | ... 
    //     ==> d0 d1 d2 d3 | d4 d5 ...
    void getEffectByteData(Selection::Opaque &sel,
                           vector<GenRegister> &effectData,
                           vector<GenRegister> &tmp,
                           uint32_t effectDataNum,
                           const GenRegister &address,
                           bool isUniform) const
    {
      using namespace ir;
      GBE_ASSERT(effectData.size() == effectDataNum);
      GBE_ASSERT(tmp.size() == effectDataNum + 1);
      RegisterFamily addrFamily = sel.getRegisterFamily(address.reg());
      sel.push();
        Register alignedFlag = sel.reg(FAMILY_BOOL, isUniform);
        GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
        Register shiftHReg = sel.reg(FAMILY_DWORD, isUniform);
        GenRegister shiftH = sel.selReg(shiftHReg, ir::TYPE_U32);
        sel.push();
          if (isUniform)
            sel.curr.noMask = 1;
          if (addrFamily == FAMILY_QWORD) {
            GenRegister t = convertU64ToU32(sel, address);
            sel.AND(shiftL, t, GenRegister::immud(0x3));
          } else {
            sel.AND(shiftL, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(0x3));
          }
          sel.SHL(shiftL, shiftL, GenRegister::immud(0x3));
          sel.ADD(shiftH, GenRegister::negate(shiftL), GenRegister::immud(32));
          sel.curr.physicalFlag = 0;
          sel.curr.modFlag = 1;
          sel.curr.predicate = GEN_PREDICATE_NONE;
          sel.curr.flagIndex = alignedFlag.value();
          sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shiftHReg), GenRegister::immuw(32));
        sel.pop();

        sel.curr.noMask = 1;
        for (uint32_t i = 0; i < effectDataNum; i++) {
          GenRegister tmpH = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
          GenRegister tmpL = effectData[i];
          sel.SHR(tmpL, tmp[i], shiftL);
          sel.push();
            // Only need to consider tmpH when the address is not aligned.
            sel.curr.modFlag = 0;
            sel.curr.physicalFlag = 0;
            sel.curr.flagIndex = alignedFlag.value();
            sel.curr.predicate = GEN_PREDICATE_NORMAL;
            sel.SHL(tmpH, tmp[i + 1], shiftH);
            sel.OR(effectData[i], tmpL, tmpH);
          sel.pop();
        }
      sel.pop();
    }

    /* Used to transform an address from 64 bit to 32 bit. Note that dataport
     * messages cannot accept a scalar (uniform) register, so we also convert
     * to a non-uniform register here.
*/ GenRegister convertU64ToU32(Selection::Opaque &sel, GenRegister addr) const { GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD); GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32); sel.MOV(dst, unpacked); return dst; } void byteGatherStateless(Selection::Opaque &sel, GenRegister addr, GenRegister dst, unsigned elemSize) const { using namespace ir; GenRegister addrQ; unsigned simdWidth = sel.ctx.getSimdWidth(); unsigned addrBytes = typeSize(addr.type); if (addrBytes == 4) { addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64); sel.MOV(addrQ, addr); } else { addrQ = addr; } sel.push(); if (simdWidth == 8) { sel.BYTE_GATHERA64(dst, addrQ, elemSize); } else if (simdWidth == 16) { sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q1; sel.BYTE_GATHERA64(GenRegister::Qn(dst, 0), GenRegister::Qn(addrQ, 0), elemSize); sel.curr.quarterControl = GEN_COMPRESSION_Q2; sel.BYTE_GATHERA64(GenRegister::Qn(dst, 1), GenRegister::Qn(addrQ, 1), elemSize); } sel.pop(); } void shootByteGatherMsg(Selection::Opaque &sel, const ir::LoadInstruction &insn, GenRegister dst, GenRegister addr, unsigned elemSize, bool isUniform, ir::AddressSpace addrSpace) const { using namespace ir; unsigned addrBytes = typeSize(addr.type); AddressMode AM = insn.getAddressMode(); vector btiTemp = sel.getBTITemps(AM); if (AM == AM_DynamicBti || AM == AM_StaticBti) { if (AM == AM_DynamicBti) { Register btiReg = insn.getBtiReg(); sel.BYTE_GATHER(dst, addr, elemSize, sel.selReg(btiReg, TYPE_U32), btiTemp); } else { unsigned SI = insn.getSurfaceIndex(); sel.BYTE_GATHER(dst, addr, elemSize, GenRegister::immud(SI), btiTemp); } } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) { unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe; GenRegister addrDW = addr; if (addrBytes == 8) { addrDW = convertU64ToU32(sel, addr); } sel.BYTE_GATHER(dst, addrDW, elemSize, GenRegister::immud(bti), btiTemp); } else if (addrSpace == ir::MEM_GENERIC) { Register localMask = generateLocalMask(sel, addr); sel.push(); sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL); GenRegister addrDW = addr; if (addrBytes == 8) addrDW = convertU64ToU32(sel, addr); sel.BYTE_GATHER(dst, addrDW, elemSize, GenRegister::immud(0xfe), btiTemp); sel.curr.inversePredicate = 1; byteGatherStateless(sel, addr, dst, elemSize); sel.pop(); } else { byteGatherStateless(sel, addr, dst, elemSize); } } void emitUnalignedByteGather(Selection::Opaque &sel, const ir::LoadInstruction &insn, const uint32_t elemSize, GenRegister address, ir::AddressSpace addrSpace) const { using namespace ir; const uint32_t valueNum = insn.getValueNum(); const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? 1 : sel.ctx.getSimdWidth(); const bool isUniform = simdWidth == 1; RegisterFamily family = getFamily(insn.getValueType()); RegisterFamily addrFamily = sel.getRegisterFamily(address.reg()); Type addrType = getType(addrFamily); if(valueNum > 1) { GBE_ASSERT(!isUniform && "vector load should not be uniform. Something went wrong."); //need to investigate the case of GEN_BYTE_SCATTER_WORD later //for GEN_BYTE_SCATTER_BYTE, if the pointer is not aligned to 4, using byte gather, // on BDW, vec8 and vec16 are worse. on SKL/BXT, vec16 is worse. 
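        // A sketch of the shift/OR recombination used on this path, with a
        // hypothetical byte offset of 1 inside a dword:
        //   tmp[0..N]  <- N+1 aligned dwords read from (addr & ~0x3)
        //   shiftL     =  (addr & 0x3) * 8       // 8 in this example
        //   effect[i]  =  (tmp[i] >> shiftL) | (tmp[i+1] << (32 - shiftL))
        // i.e. N+1 aligned dwords are folded into the N dwords the kernel
        // actually asked for (see getEffectByteData above).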
if(sel.getSlowByteGather() || elemSize == GEN_BYTE_SCATTER_WORD || (elemSize == GEN_BYTE_SCATTER_BYTE && (valueNum == 16 || valueNum == 8))) { vector dst(valueNum); const uint32_t typeSize = getFamilySize(family); for(uint32_t i = 0; i < valueNum; i++) dst[i] = sel.selReg(insn.getValue(i), getType(family)); uint32_t effectDataNum = (typeSize*valueNum + 3) / 4; vector tmp(effectDataNum + 1); vector effectData(effectDataNum); for(uint32_t i = 0; i < effectDataNum + 1; i++) tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); GenRegister alignedAddr = sel.selReg(sel.reg(addrFamily, isUniform), addrType); sel.push(); if (isUniform) sel.curr.noMask = 1; if (addrFamily == FAMILY_DWORD) sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3)); else { sel.MOV(alignedAddr, GenRegister::immuint64(~0x3ul)); sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UL), alignedAddr); } sel.pop(); uint32_t remainedReg = effectDataNum + 1; uint32_t pos = 0; do { uint32_t width = remainedReg > 4 ? 4 : remainedReg; vector t1(tmp.begin() + pos, tmp.begin() + pos + width); if (pos != 0) { sel.push(); if (isUniform) sel.curr.noMask = 1; if (addrFamily == FAMILY_DWORD) sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4)); else sel.ADD(alignedAddr, alignedAddr, GenRegister::immuint64(pos * 4)); sel.pop(); } shootUntypedReadMsg(sel, insn, t1, alignedAddr, width, addrSpace); remainedReg -= width; pos += width; } while(remainedReg); for(uint32_t i = 0; i < effectDataNum; i++) effectData[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); getEffectByteData(sel, effectData, tmp, effectDataNum, address, isUniform); for(uint32_t i = 0; i < effectDataNum; i++) { unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ? 4/typeSize : (valueNum - i * (4 / typeSize)); sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize, elemNum); } } else { GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_BYTE); vector dst(valueNum); for(uint32_t i = 0; i < valueNum; i++) dst[i] = sel.selReg(insn.getValue(i), getType(family)); GenRegister readDst = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); uint32_t valueIndex = 0; uint32_t loopCount = (valueNum + 3) / 4; GenRegister addressForLoop = address; sel.push(); if (loopCount > 1) { addressForLoop = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); sel.MOV(addressForLoop, address); } for (uint32_t i = 0; i < loopCount; ++i) { uint32_t valueLeft = valueNum - valueIndex; GBE_ASSERT(valueLeft > 1); uint32_t dataSize = 0; if (valueLeft == 2) dataSize = GEN_BYTE_SCATTER_WORD; else dataSize = GEN_BYTE_SCATTER_DWORD; shootByteGatherMsg(sel, insn, readDst, addressForLoop, dataSize, isUniform, addrSpace); // only 4 bytes is gathered even if valueLeft >= 4 sel.UNPACK_BYTE(dst.data(), readDst, getFamilySize(FAMILY_BYTE), (valueLeft < 4 ? 
valueLeft : 4));
          valueIndex += 4;

          // Calculate the new address to read
          if (valueIndex < valueNum)
            sel.ADD(addressForLoop, addressForLoop, GenRegister::immud(4));
        }
        sel.pop();
        }
      } else {
        GBE_ASSERT(insn.getValueNum() == 1);
        const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
        GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
        if (sel.getSlowByteGather())
          readByteAsDWord(sel, insn, elemSize, address, value, isUniform, addrSpace);
        else {
          // We need a temporary register if we read bytes or words
          Register dst = sel.reg(FAMILY_DWORD);
          sel.push();
            if (isUniform)
              sel.curr.noMask = 1;
            shootByteGatherMsg(sel, insn, sel.selReg(dst, ir::TYPE_U32), address, elemSize, isUniform, addrSpace);
          sel.pop();

          sel.push();
            if (isUniform) {
              sel.curr.noMask = 1;
              sel.curr.execWidth = 1;
              sel.curr.predicate = GEN_PREDICATE_NONE;
            }
            if (elemSize == GEN_BYTE_SCATTER_WORD)
              sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst, isUniform));
            else if (elemSize == GEN_BYTE_SCATTER_BYTE)
              sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst, isUniform));
          sel.pop();
        }
      }
    }

    void emitOWordRead(Selection::Opaque &sel,
                       const ir::LoadInstruction &insn,
                       GenRegister address,
                       ir::AddressSpace addrSpace) const
    {
      using namespace ir;
      uint32_t SI = insn.getSurfaceIndex();
      const uint32_t vec_size = insn.getValueNum();
      const uint32_t simdWidth = sel.ctx.getSimdWidth();
      const Type type = insn.getValueType();
      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
      const RegisterFamily family = getFamily(type);
      bool isA64 = SI == 255;

      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
      vector<GenRegister> valuesVec;
      vector<GenRegister> tmpVec;
      for (uint32_t i = 0; i < vec_size; i++)
        valuesVec.push_back(sel.selReg(insn.getValue(i), type));

      GenRegister headeraddr;
      if (isA64)
        headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
      else
        headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 2 * 4), GEN_TYPE_UD);

      // Make the header
      sel.push();
      {
        // Copy r0 into the header first
        sel.curr.execWidth = 8;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.curr.noMask = 1;
        sel.MOV(header, GenRegister::ud8grf(0, 0));

        // Update the header with the current address
        sel.curr.execWidth = 1;

        // Put zero in the general state base address
        if (isA64)
          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
        else {
          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD));
          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
        }
      }
      sel.pop();

      /* For block reads we need to unpack the block data into values, and for
       * different simd widths and vector sizes with different type sizes we
       * may need to split the block read send message.
       * A send message can only be 5 registers long, so for the different
       * combinations we have different message lengths and tmp vector sizes:
       *               | simd8 | simd16 | simd8 | simd16
       *  r0 | header  |       |        |       |
       *  r1 | data    | w0,w1 | w0     | dw0   | dw0
       *  r2 | data    | w2,w3 | w1     | dw1   | dw0
       *  r3 | data    | ..... | ...... | ..... | dw1
       *  r4 | data    | ..... | ...... | ..... | dw1
       */
      uint32_t totalSize = simdWidth * typeSize * vec_size;
      uint32_t valueSize = simdWidth * typeSize;
      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
      uint32_t msg_num = vec_size / tmp_size;
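      // A worked example of the splitting arithmetic (hypothetical case):
      // simd16 with a 4-element uint vector gives totalSize = 16*4*4 = 256B
      // and valueSize = 64B, so tmp_size = 128/64 = 2 values per message and
      // msg_num = 4/2 = 2 messages of 8 owords each, with the header address
      // bumped by 128 bytes between the two sends.
      uint32_t ow_size = msg_num > 1 ?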
8 : (totalSize / 16);
      for (uint32_t i = 0; i < tmp_size; i++)
        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));

      for (uint32_t i = 0; i < msg_num; i++) {
        if (i > 0) {
          sel.push();
          {
            // Update the address in the header
            sel.curr.execWidth = 1;
            sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
          }
          sel.pop();
        }
        sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
        for (uint32_t j = 0; j < tmp_size; j++)
          sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
      }
    }

    // Check whether all bound binding table indices point to constant memory
    INLINE bool isAllConstant(const ir::BTI &bti) const {
      if (bti.isConst && bti.imm == BTI_CONSTANT)
        return true;
      return false;
    }

    /*! Implements base class */
    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
    {
      using namespace ir;
      const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
      Register reg = insn.getAddressRegister();
      GenRegister address = sel.selReg(reg, getType(sel.getRegisterFamily(reg)));
      GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
                 insn.getAddressSpace() == MEM_CONSTANT ||
                 insn.getAddressSpace() == MEM_PRIVATE ||
                 insn.getAddressSpace() == MEM_LOCAL ||
                 insn.getAddressSpace() == MEM_GENERIC ||
                 insn.getAddressSpace() == MEM_MIXED);
      //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);

      AddressSpace addrSpace = insn.getAddressSpace();

      const Type type = insn.getValueType();
      const uint32_t elemSize = getByteScatterGatherSize(sel, type);

      if (insn.isBlock())
        this->emitOWordRead(sel, insn, address, addrSpace);
      else if (isReadConstantLegacy(insn)) {
        // XXX TODO: read 64-bit constants through the constant cache.
        // Per the HW spec, constant cache messages read DWORD data at
        // minimum, so for byte/short data types we have to read through
        // the data cache.
        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
          this->emitRead64(sel, insn, address, addrSpace);
        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
          this->emitDWordGather(sel, insn, address, addrSpace);
        else if (insn.isAligned() == true)
          this->emitAlignedByteGather(sel, insn, elemSize, address, addrSpace);
        else
          this->emitUnalignedByteGather(sel, insn, elemSize, address, addrSpace);
      } else {
        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
          this->emitRead64(sel, insn, address, addrSpace);
        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
          this->emitUntypedRead(sel, insn, address, addrSpace);
        else if (insn.isAligned())
          this->emitAlignedByteGather(sel, insn, elemSize, address, addrSpace);
        else
          this->emitUnalignedByteGather(sel, insn, elemSize, address, addrSpace);
      }

      markAllChildren(dag);
      return true;
    }
  };

  class StoreInstructionPattern : public SelectionPattern
  {
  public:
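    // The store side below mirrors the load side: for MEM_GENERIC addresses,
    // a local-memory mask (addr < 64KB, see generateLocalMask) predicates a
    // bti-0xfe UNTYPED_WRITE for the SLM lanes, and the inverse predicate
    // routes the remaining lanes to the stateless A64 write path.
    /*!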
Register the pattern for all opcodes of the family */ StoreInstructionPattern(void) : SelectionPattern(1, 1) { this->opcodes.push_back(ir::OP_STORE); } GenRegister convertU64ToU32(Selection::Opaque &sel, GenRegister addr) const { GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD); GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32); sel.MOV(dst, unpacked); return dst; } void untypedWriteStateless(Selection::Opaque &sel, GenRegister address, vector &value) const { using namespace ir; unsigned simdWidth = sel.ctx.getSimdWidth(); unsigned int addrBytes = typeSize(address.type); unsigned valueNum = value.size(); GenRegister addrQ; if (addrBytes == 4) { if (simdWidth == 8) { addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64); sel.MOV(addrQ, address); } else if (simdWidth == 16) { addrQ = address; } } else if (addrBytes == 8) { addrQ = address; } if (simdWidth == 8) { vector msg; msg.push_back(addrQ); for (unsigned k = 0; k < valueNum; k++) msg.push_back(value[k]); sel.UNTYPED_WRITEA64(msg.data(), valueNum+1, valueNum); } else if (simdWidth == 16) { vector msgs; for (unsigned k = 0; k < (valueNum+1)/2+1; k++) { msgs.push_back(sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32)); } sel.push(); /* do first quarter */ sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q1; sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0)); for (unsigned k = 0; k < valueNum; k++) { sel.MOV(GenRegister::Qn(msgs[k/2+1], k%2), GenRegister::Qn(value[k], 0)); } sel.UNTYPED_WRITEA64(msgs.data(), (valueNum+1)/2+1, valueNum); /* do second quarter */ sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q2; sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1)); for (unsigned k = 0; k < valueNum; k++) sel.MOV(GenRegister::Qn(msgs[k/2+1], k%2), GenRegister::Qn(value[k], 1)); sel.UNTYPED_WRITEA64(msgs.data(), (valueNum+1)/2+1, valueNum); sel.pop(); } } void shootUntypedWriteMsg(Selection::Opaque &sel, const ir::StoreInstruction &insn, GenRegister &address, vector &value, ir::AddressSpace addrSpace) const { using namespace ir; unsigned int addrBytes = typeSize(address.type); unsigned valueNum = value.size(); AddressMode AM = insn.getAddressMode(); vector btiTemp = sel.getBTITemps(AM); if (AM == AM_DynamicBti || AM == AM_StaticBti) { if (AM == AM_DynamicBti) { Register btiReg = insn.getBtiReg(); sel.UNTYPED_WRITE(address, value.data(), valueNum, sel.selReg(btiReg, TYPE_U32), btiTemp); } else { unsigned SI = insn.getSurfaceIndex(); sel.UNTYPED_WRITE(address, value.data(), valueNum, GenRegister::immud(SI), btiTemp); } } else if (addrSpace == ir::MEM_LOCAL) { GenRegister addr = address; if (addrBytes == 8) { addr = convertU64ToU32(sel, address); } sel.UNTYPED_WRITE(addr, value.data(), valueNum, GenRegister::immud(0xfe), btiTemp); } else if (addrSpace == ir::MEM_GENERIC) { Register localMask = generateLocalMask(sel, address); sel.push(); sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL); GenRegister addrDW = address; if (addrBytes == 8) addrDW = convertU64ToU32(sel, address); sel.UNTYPED_WRITE(addrDW, value.data(), valueNum, GenRegister::immud(0xfe), btiTemp); sel.curr.inversePredicate = 1; untypedWriteStateless(sel, address, value); sel.pop(); } else { untypedWriteStateless(sel, address, value); } } void emitUntypedWrite(Selection::Opaque &sel, const ir::StoreInstruction &insn, GenRegister address, ir::AddressSpace addrSpace) const { using namespace ir; const uint32_t valueNum = 
insn.getValueNum(); vector value(valueNum); for (uint32_t valueID = 0; valueID < valueNum; ++valueID) value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD); shootUntypedWriteMsg(sel, insn, address, value, addrSpace); } void write64Legacy(Selection::Opaque &sel, GenRegister address, vector &value, GenRegister bti, vector &btiTemp) const { using namespace ir; const uint32_t valueNum = value.size(); if (sel.hasLongType()) { vector tmp(valueNum); for (uint32_t valueID = 0; valueID < valueNum; ++valueID) { tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL); } sel.WRITE64(address, value.data(), tmp.data(), valueNum, bti, true, btiTemp); } else { sel.WRITE64(address, value.data(), NULL, valueNum, bti, false, btiTemp); } } void write64Stateless(Selection::Opaque &sel, GenRegister address, vector &value) const { using namespace ir; unsigned simdWidth = sel.ctx.getSimdWidth(); unsigned int addrBytes = typeSize(address.type); unsigned valueNum = value.size(); vector tmp(valueNum); for (uint32_t valueID = 0; valueID < valueNum; ++valueID) { tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL); } GenRegister addrQ; if (addrBytes == 4) { addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64); sel.MOV(addrQ, address); } else { addrQ = address; } sel.push(); if (simdWidth == 8) { sel.WRITE64A64(addrQ, value.data(), tmp.data(), valueNum); } else { GenRegister tmpAddr, tmpSrc; tmpAddr = GenRegister::Qn(addrQ, 0); tmpSrc = GenRegister::Qn(value[0], 0); GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64); /* SIMD16 long register is just enough for (SIMD8 A64 addr + SIMD8 long) */ sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q1; sel.MOV(GenRegister::Qn(tmp, 0), tmpAddr); sel.UNPACK_LONG(GenRegister::Qn(tmp, 1), tmpSrc); sel.UNTYPED_WRITEA64(&tmp, 1, 2); tmpAddr = GenRegister::Qn(addrQ, 1); tmpSrc = GenRegister::Qn(value[0], 1); sel.curr.quarterControl = GEN_COMPRESSION_Q2; sel.MOV(GenRegister::Qn(tmp, 0), tmpAddr); sel.UNPACK_LONG(GenRegister::Qn(tmp, 1), tmpSrc); sel.UNTYPED_WRITEA64(&tmp, 1, 2); } sel.pop(); } void emitWrite64(Selection::Opaque &sel, const ir::StoreInstruction &insn, GenRegister address, ir::AddressSpace addrSpace) const { using namespace ir; const uint32_t valueNum = insn.getValueNum(); /* XXX support scalar only right now. 
*/ GBE_ASSERT(valueNum == 1); vector src(valueNum); for (uint32_t valueID = 0; valueID < valueNum; ++valueID) src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64); AddressMode AM = insn.getAddressMode(); unsigned int addrBytes = typeSize(address.type); vector btiTemp = sel.getBTITemps(AM); if (AM != AM_Stateless) { GenRegister b; if (AM == AM_DynamicBti) { b = sel.selReg(insn.getBtiReg(), TYPE_U32); } else { b = GenRegister::immud(insn.getSurfaceIndex()); } write64Legacy(sel, address, src, b, btiTemp); } else if (addrSpace == MEM_LOCAL) { GenRegister b = GenRegister::immud(0xfe); GenRegister addr = address; if (addrBytes == 8) { addr = convertU64ToU32(sel, address); } write64Legacy(sel, addr, src, b, btiTemp); } else if (addrSpace == ir::MEM_GENERIC) { Register localMask = generateLocalMask(sel, address); sel.push(); sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL); GenRegister addrDW = address; if (addrBytes == 8) addrDW = convertU64ToU32(sel, address); write64Legacy(sel, addrDW, src, GenRegister::immud(0xfe), btiTemp); sel.curr.inversePredicate = 1; write64Stateless(sel, address, src); sel.pop(); } else { GBE_ASSERT(sel.hasLongType()); write64Stateless(sel, address, src); } } void byteScatterStateless(Selection::Opaque &sel, GenRegister address, GenRegister data, unsigned elemSize) const { using namespace ir; unsigned addrBytes = typeSize(address.type); unsigned simdWidth = sel.ctx.getSimdWidth(); GenRegister addrQ; if (addrBytes == 4) { addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64); sel.MOV(addrQ, address); } else { addrQ = address; } if (simdWidth == 8) { GenRegister msg[2]; msg[0] = addrQ; msg[1] = data; sel.BYTE_SCATTERA64(msg, 2, elemSize); } else if (simdWidth == 16) { GenRegister msgs[2]; msgs[0] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32); msgs[1] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32); sel.push(); sel.curr.execWidth = 8; /* do first quarter */ sel.curr.quarterControl = GEN_COMPRESSION_Q1; sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0)); sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(data, 0)); sel.BYTE_SCATTERA64(msgs, 2, elemSize); /* do second quarter */ sel.curr.quarterControl = GEN_COMPRESSION_Q2; sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1)); sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(data, 1)); sel.BYTE_SCATTERA64(msgs, 2, elemSize); sel.pop(); } } void shootByteScatterMsg(Selection::Opaque &sel, const ir::StoreInstruction &insn, GenRegister address, GenRegister data, unsigned elemSize, ir::AddressSpace addrSpace) const { using namespace ir; unsigned addrBytes = typeSize(address.type); AddressMode AM = insn.getAddressMode(); vector btiTemp = sel.getBTITemps(AM); if (AM != AM_Stateless) { if (AM == AM_DynamicBti) { Register btiReg = insn.getBtiReg(); sel.BYTE_SCATTER(address, data, elemSize, sel.selReg(btiReg, TYPE_U32), btiTemp); } else { unsigned SI = insn.getSurfaceIndex(); sel.BYTE_SCATTER(address, data, elemSize, GenRegister::immud(SI), btiTemp); } } else if (addrSpace == ir::MEM_LOCAL) { GenRegister addr = address; if (addrBytes == 8) { addr = convertU64ToU32(sel, address); } sel.BYTE_SCATTER(addr, data, elemSize, GenRegister::immud(0xfe), btiTemp); } else if (addrSpace == ir::MEM_GENERIC) { Register localMask = generateLocalMask(sel, address); sel.push(); sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL); GenRegister addrDW = address; if (addrBytes == 8) addrDW = convertU64ToU32(sel, address); sel.BYTE_SCATTER(addrDW, data, 
                           elemSize, GenRegister::immud(0xfe), btiTemp);

          sel.curr.inversePredicate = 1;
          byteScatterStateless(sel, address, data, elemSize);
        sel.pop();
      } else {
        byteScatterStateless(sel, address, data, elemSize);
      }
    }

    void emitByteScatter(Selection::Opaque &sel,
                         const ir::StoreInstruction &insn,
                         const uint32_t elemSize,
                         GenRegister address,
                         ir::AddressSpace addrSpace,
                         bool isUniform) const
    {
      using namespace ir;
      uint32_t valueNum = insn.getValueNum();

      if (valueNum > 1) {
        const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
        vector<GenRegister> value(valueNum);

        if (elemSize == GEN_BYTE_SCATTER_WORD) {
          for (uint32_t i = 0; i < valueNum; i++)
            value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16);
        } else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
          for (uint32_t i = 0; i < valueNum; i++)
            value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U8);
        }

        uint32_t tmpRegNum = typeSize * valueNum / 4;
        vector<GenRegister> tmp(tmpRegNum);
        for (uint32_t i = 0; i < tmpRegNum; i++) {
          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
          sel.PACK_BYTE(tmp[i], value.data() + i * 4 / typeSize, typeSize, 4 / typeSize);
        }

        shootUntypedWriteMsg(sel, insn, address, tmp, addrSpace);
      } else {
        const GenRegister value = sel.selReg(insn.getValue(0));
        GBE_ASSERT(insn.getValueNum() == 1);
        const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);

        if (elemSize == GEN_BYTE_SCATTER_WORD)
          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
        else if (elemSize == GEN_BYTE_SCATTER_BYTE)
          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));

        shootByteScatterMsg(sel, insn, address, tmp, elemSize, addrSpace);
      }
    }

    void emitOWordWrite(Selection::Opaque &sel,
                        const ir::StoreInstruction &insn,
                        GenRegister address,
                        ir::AddressSpace addrSpace) const
    {
      using namespace ir;
      uint32_t SI = insn.getSurfaceIndex();
      const uint32_t vec_size = insn.getValueNum();
      const uint32_t simdWidth = sel.ctx.getSimdWidth();
      const Type type = insn.getValueType();
      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
      const RegisterFamily family = getFamily(type);
      bool isA64 = SI == 255;
      uint32_t offset_size = isA64 ? 128 : 8;

      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
      vector<GenRegister> valuesVec;
      vector<GenRegister> tmpVec;
      for (uint32_t i = 0; i < vec_size; i++)
        valuesVec.push_back(sel.selReg(insn.getValue(i), type));

      GenRegister headeraddr;
      if (isA64)
        headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
      else
        headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 2 * 4), GEN_TYPE_UD);

      // Make the header
      sel.push();
      {
        // Copy r0 into the header first
        sel.curr.execWidth = 8;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.curr.noMask = 1;
        sel.MOV(header, GenRegister::ud8grf(0, 0));

        // Update the header with the current address
        sel.curr.execWidth = 1;

        // Put zero in the general state base address
        if (isA64)
          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
        else {
          sel.SHR(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD), GenRegister::immud(4));
          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
        }
      }
      sel.pop();

      /* For block writes we need to pack the block data into tmp registers,
       * and for different simd widths and vector sizes with different type
       * sizes we may need to split the block write send message.
       * A send message can only be 5 registers long, so for the different
       * combinations we have different message lengths and tmp vector sizes:
       *               | simd8 | simd16 | simd8 | simd16
       *  r0 | header  |       |        |       |
       *  r1 | data    | w0,w1 | w0     | dw0   | dw0
       *  r2 | data    | w2,w3 | w1     | dw1   | dw0
       *  r3 | data    | ..... | ...... | ..... | dw1
       *  r4 | data    | ..... | ...... | ..... | dw1
       */
      uint32_t totalSize = simdWidth * typeSize * vec_size;
      uint32_t valueSize = simdWidth * typeSize;
      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
      uint32_t msg_num = vec_size / tmp_size;
      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);

      for (uint32_t i = 0; i < tmp_size; i++)
        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));

      for (uint32_t i = 0; i < msg_num; i++) {
        for (uint32_t j = 0; j < tmp_size; j++)
          sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);

        if (i > 0) {
          sel.push();
          {
            // Update the address in the header
            sel.curr.execWidth = 1;
            sel.ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
          }
          sel.pop();
        }

        sel.push();
          // In simd8 mode, when the data takes more than 1 reg, execWidth 8
          // gives a wrong result, so set the execWidth to 16.
          sel.curr.execWidth = 16;
          sel.curr.predicate = GEN_PREDICATE_NONE;
          sel.curr.noMask = 1;
          sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
        sel.pop();
      }
    }

    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
    {
      using namespace ir;
      const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
      Register reg = insn.getAddressRegister();
      GenRegister address = sel.selReg(reg, getType(sel.getRegisterFamily(reg)));
      AddressSpace addrSpace = insn.getAddressSpace();

      const Type type = insn.getValueType();
      const uint32_t elemSize = getByteScatterGatherSize(sel, type);
      const bool isUniform = sel.isScalarReg(insn.getAddressRegister()) &&
                             sel.isScalarReg(insn.getValue(0));

      if (insn.isBlock())
        this->emitOWordWrite(sel, insn, address, addrSpace);
      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
        this->emitWrite64(sel, insn, address, addrSpace);
      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
        this->emitUntypedWrite(sel, insn, address, addrSpace);
      else {
        this->emitByteScatter(sel, insn, elemSize, address, addrSpace, isUniform);
      }

      markAllChildren(dag);
      return true;
    }
  };

  /*! Compare instruction pattern */
  class CompareInstructionPattern : public SelectionPattern
  {
  public:
    CompareInstructionPattern(void) : SelectionPattern(1, 1) {
      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
        if (ir::isOpcodeFrom<ir::CompareInstruction>(ir::Opcode(op)) == true)
          this->opcodes.push_back(ir::Opcode(op));
    }

    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
    {
      using namespace ir;
      const ir::CompareInstruction &insn = cast<CompareInstruction>(dag.insn);
      const Opcode opcode = insn.getOpcode();
      const Type type = insn.getType();
      const Register dst = insn.getDst(0);
      GenRegister tmpDst;
      const BasicBlock *curr = insn.getParent();
      const ir::Liveness &liveness = sel.ctx.getLiveness();
      const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(curr);
      bool needStoreBool = false;
      if (liveOut.contains(dst) || dag.computeBool)
        needStoreBool = true;

      // Why do we set tmpDst to null for the types listed below?
      // Because for those types the compare instruction cannot write its
      // bool (UW) result to the GRF directly, so we need an extra select
      // to materialize the bool value in the GRF.
      if (type == TYPE_S64 || type == TYPE_U64 ||
          type == TYPE_DOUBLE || type == TYPE_FLOAT ||
          type == TYPE_U32 || type == TYPE_S32 ||
          type == TYPE_HALF /*|| (!needStoreBool)*/)
        tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_F);
      else
        tmpDst = sel.selReg(dst, TYPE_BOOL);

      // Look for immediate values for the right source
      GenRegister src0, src1;
      bool inverseCmp = false;
      sel.getSrcGenRegImm(dag, src0, src1, type, inverseCmp);
      sel.push();
        if (sel.isScalarReg(dst))
          sel.curr.noMask = 1;
        sel.curr.physicalFlag = 0;
        sel.curr.modFlag = 1;
        sel.curr.flagIndex = dst.value();
        sel.curr.grfFlag = needStoreBool; // indicates whether we need to allocate a grf to store this boolean
        if ((type == TYPE_S64 || type == TYPE_U64) && !sel.hasLongType()) {
          GenRegister tmp[3];
          for (int i = 0; i < 3; i++)
            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
          sel.curr.flagGen = 1;
          sel.I64CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmp);
        } else if (opcode == OP_ORD) {
          sel.push();
            sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
            sel.curr.predicate = GEN_PREDICATE_NORMAL;
            sel.curr.flagGen = 1;
            sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
          sel.pop();
        } else {
          if ((type == TYPE_S64 || type == TYPE_U64 ||
               type == TYPE_DOUBLE || type == TYPE_FLOAT ||
               type == TYPE_U32 || type == TYPE_S32 || type == TYPE_HALF))
            sel.curr.flagGen = 1;
          else if (sel.isScalarReg(dst)) {
            // If the dest reg is a scalar bool, we can't use it as the
            // dst register, as the execution width is still 8 or 16.
            // Instead, we set needStoreBool into flagGen, change the dst
            // to the null register, and let the flag reg allocation
            // function generate the flag grf on demand correctly later.
            sel.curr.flagGen = needStoreBool;
            tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_UW);
          }
          sel.CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmpDst);
        }
      sel.pop();
      return true;
    }
  };

  /*! Bit cast instruction pattern */
  DECL_PATTERN(BitCastInstruction)
  {
    INLINE bool emitOne(Selection::Opaque &sel, const ir::BitCastInstruction &insn, bool &markChildren) const
    {
      using namespace ir;
      const Type dstType = insn.getDstType();
      const Type srcType = insn.getSrcType();
      const uint32_t dstNum = insn.getDstNum();
      const uint32_t srcNum = insn.getSrcNum();
      int index = 0, multiple, narrowNum, wideNum;
      bool narrowDst;
      Type narrowType;
      bool wideScalar = false;

      if (dstNum > srcNum) {
        multiple = dstNum / srcNum;
        narrowType = dstType;
        narrowNum = dstNum;
        wideNum = srcNum;
        narrowDst = 1;
        wideScalar = sel.isScalarReg(insn.getSrc(0));
      } else {
        multiple = srcNum / dstNum;
        narrowType = srcType;
        narrowNum = srcNum;
        wideNum = dstNum;
        narrowDst = 0;
        wideScalar = sel.isScalarReg(insn.getDst(0));
      }

      sel.push();
      if (sel.isScalarReg(insn.getDst(0)) == true) {
        sel.curr.execWidth = 1;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.curr.noMask = 1;
      }

      // As we store the long/ulong low/high parts separately, we need to
      // deal with them separately here; we should change this back once the
      // hardware supports the native long type.
      const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || dstType == TYPE_S64 || dstType == TYPE_U64);
      const int simdWidth = sel.curr.execWidth;

      /* Because we do not have hstride = 8 here, we need to separate the
         long into a top half and a bottom half. */
      vector<GenRegister> tmp(wideNum);
      if (multiple == 8 && sel.hasLongType() && !wideScalar) {
        GBE_ASSERT(isInt64); // Must relate to long and char conversion.
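        // Example of the multiple == 8 case handled here (a sketch): a
        // long1 <-> char8 bitcast. With native long support, each 64-bit
        // value is first split with UNPACK_LONG into a temporary holding its
        // two 32-bit halves (or rebuilt with PACK_LONG at the end), and the
        // eight byte-sized elements are then moved through strided
        // (unpacked) byte regions of that temporary, since no direct
        // hstride-8 byte view of a 64-bit register exists.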
if (narrowDst) { for (int i = 0; i < wideNum; i++) { tmp[i] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); sel.UNPACK_LONG(tmp[i], sel.selReg(insn.getSrc(i), srcType)); } } else { for (int i = 0; i < wideNum; i++) { tmp[i] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); } } } for(int i = 0; i < narrowNum; i++, index++) { GenRegister narrowReg, wideReg; if (multiple == 8 && sel.hasLongType() && !wideScalar) { if(narrowDst) { narrowReg = sel.selReg(insn.getDst(i), narrowType); wideReg = GenRegister::retype(tmp[index/multiple], narrowType); //retype to narrow type } else { wideReg = GenRegister::retype(tmp[index/multiple], narrowType); narrowReg = sel.selReg(insn.getSrc(i), narrowType); //retype to narrow type } } else { if(narrowDst) { narrowReg = sel.selReg(insn.getDst(i), narrowType); wideReg = sel.selReg(insn.getSrc(index/multiple), narrowType); //retype to narrow type } else { wideReg = sel.selReg(insn.getDst(index/multiple), narrowType); narrowReg = sel.selReg(insn.getSrc(i), narrowType); //retype to narrow type } } // set correct horizontal stride if(wideReg.hstride != GEN_HORIZONTAL_STRIDE_0) { if(multiple == 2) { if (sel.hasLongType() && isInt64) { // long to int or int to long wideReg = sel.unpacked_ud(wideReg.reg()); wideReg = GenRegister::retype(wideReg, getGenType(narrowType)); } else { wideReg = sel.unpacked_uw(wideReg.reg()); wideReg = GenRegister::retype(wideReg, getGenType(narrowType)); if(isInt64) { wideReg.width = GEN_WIDTH_8; wideReg.hstride = GEN_HORIZONTAL_STRIDE_1; wideReg.vstride = GEN_VERTICAL_STRIDE_8; } } } else if(multiple == 4) { if (sel.hasLongType() && isInt64) { // long to short or short to long wideReg = sel.unpacked_uw(wideReg.reg()); wideReg = GenRegister::retype(wideReg, getGenType(narrowType)); } else { wideReg = sel.unpacked_ub(wideReg.reg()); wideReg = GenRegister::retype(wideReg, getGenType(narrowType)); if(isInt64) { wideReg.hstride = GEN_HORIZONTAL_STRIDE_2; wideReg.vstride = GEN_VERTICAL_STRIDE_16; } } } else if(multiple == 8) { // we currently store high/low 32bit separately in register, // so, its hstride is 4 here. wideReg = sel.unpacked_ub(wideReg.reg()); wideReg = GenRegister::retype(wideReg, getGenType(narrowType)); } else { GBE_ASSERT(0); } } if((!isInt64 || (sel.hasLongType() && multiple != 8)) && index % multiple) { wideReg = sel.getOffsetReg(wideReg, 0, (index % multiple) * typeSize(wideReg.type)); } if(isInt64 && (multiple == 8 || !sel.hasLongType())) { // Offset to next half if((i % multiple) >= multiple/2) wideReg = sel.getOffsetReg(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4); // Offset to desired narrow element in wideReg if(index % (multiple/2)) wideReg = sel.getOffsetReg(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type)); } GenRegister xdst = narrowDst ? narrowReg : wideReg; GenRegister xsrc = narrowDst ? wideReg : narrowReg; if(isInt64) { sel.MOV(xdst, xsrc); } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) { sel.push(); sel.curr.execWidth = 8; for(int i = 0; i < simdWidth/4; i ++) { sel.curr.chooseNib(i); sel.MOV(xdst, xsrc); xdst = sel.getOffsetReg(xdst, 0, 4 * typeSize(getGenType(dstType))); xsrc = sel.getOffsetReg(xsrc, 0, 4 * typeSize(getGenType(srcType))); } sel.pop(); } else sel.MOV(xdst, xsrc); } if (multiple == 8 && sel.hasLongType() && !wideScalar && !narrowDst) { for (int i = 0; i < wideNum; i++) { sel.PACK_LONG(sel.selReg(insn.getDst(i), dstType), tmp[i]); } } sel.pop(); return true; } DECL_CTOR(BitCastInstruction, 1, 1); }; /*! 
Convert instruction pattern */ DECL_PATTERN(ConvertInstruction) { INLINE bool lowerI64Reg(Selection::Opaque &sel, SelectionDAG *dag, GenRegister &src, uint32_t type) const { using namespace ir; GBE_ASSERT(type == GEN_TYPE_UD || type == GEN_TYPE_F); if (dag->insn.getOpcode() == OP_LOADI) { const auto &immInsn = cast(dag->insn); const auto imm = immInsn.getImmediate(); const Type immType = immInsn.getType(); if (immType == TYPE_S64 && imm.getIntegerValue() <= INT_MAX && imm.getIntegerValue() >= INT_MIN) { src = GenRegister::immd((int32_t)imm.getIntegerValue()); return true; } else if (immType == TYPE_U64 && imm.getIntegerValue() <= UINT_MAX) { src = GenRegister::immud((uint32_t)imm.getIntegerValue()); return true; } } else if (dag->insn.getOpcode() == OP_CVT) { const auto cvtInsn = cast(dag->insn); auto srcType = cvtInsn.getSrcType(); if (((srcType == TYPE_U32 || srcType == TYPE_S32) && (type == GEN_TYPE_UD || type == GEN_TYPE_D)) || ((srcType == TYPE_FLOAT) && type == GEN_TYPE_F)) { src = GenRegister::retype(sel.selReg(cvtInsn.getSrc(0), srcType), type); dag->isRoot = 1; return true; } else if (srcType == TYPE_FLOAT || srcType == TYPE_U16 || srcType == TYPE_S16 || srcType == TYPE_U32 || srcType == TYPE_S32) { src = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32), type); dag->isRoot = 1; sel.MOV(src, sel.selReg(cvtInsn.getSrc(0), srcType)); return true; } } return false; } INLINE void convertBetweenHalfFloat(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); const Opcode opcode = insn.getOpcode(); if (opcode == OP_F16TO32) { sel.F16TO32(dst, src); } else if (opcode == OP_F32TO16) { // We need two instructions to make the conversion GenRegister unpacked; unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))); sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.F32TO16(unpacked, src); sel.pop(); sel.MOV(dst, unpacked); } else { GBE_ASSERT("Not conversion between float and half\n"); } } INLINE void convert32bitsToSmall(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); GenRegister unpacked; const RegisterFamily dstFamily = getFamily(dstType); if (dstFamily == FAMILY_WORD) { uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W; /* The special case, when dst is half, float->word->half will lose accuracy. */ if (dstType == TYPE_HALF) { GBE_ASSERT(sel.hasHalfType()); type = GEN_TYPE_HF; } if (!sel.isScalarReg(dst.reg())) { unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, type); } else unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type); } else { const uint32_t type = dstType == TYPE_U8 ? 
GEN_TYPE_UB : GEN_TYPE_B; if (!sel.isScalarReg(dst.reg())) { unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, type); } else unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type); } sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MOV(unpacked, src); sel.pop(); if (unpacked.reg() != dst.reg()) sel.MOV(dst, unpacked); } INLINE void convertI64To16bits(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); if (dstType == TYPE_HALF) { /* There is no MOV for Long <---> Half. So Long-->Float-->half. */ GBE_ASSERT(sel.hasLongType()); GBE_ASSERT(sel.hasHalfType()); sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } GenRegister funpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); funpacked = GenRegister::retype(funpacked, GEN_TYPE_F); sel.MOV(funpacked, src); GenRegister ftmp = sel.selReg(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))); ftmp = GenRegister::retype(ftmp, GEN_TYPE_F); sel.MOV(ftmp, funpacked); GenRegister unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, GEN_TYPE_HF); sel.MOV(unpacked, ftmp); sel.pop(); sel.MOV(dst, unpacked); } else { uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W; GenRegister unpacked; if (!sel.isScalarReg(dst.reg())) { if (sel.hasLongType()) { unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); } else { unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))); } unpacked = GenRegister::retype(unpacked, type); } else { unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type); } if(!sel.hasLongType()) { GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD)); tmp.type = GEN_TYPE_D; sel.CONVI64_TO_I(tmp, src); sel.MOV(unpacked, tmp); } else { sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MOV(unpacked, src); sel.pop(); } if (unpacked.reg() != dst.reg()) { sel.MOV(dst, unpacked); } } } INLINE void convertI64ToI8(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); GenRegister unpacked; const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B; if (sel.hasLongType()) { // handle the native long logic. if (!sel.isScalarReg(dst.reg())) { /* When convert i64 to i8, the hstride should be 8, but the hstride do not support more than 4, so we need to split it to 2 steps. */ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, dstType == TYPE_U8 ? 
GEN_TYPE_UW : GEN_TYPE_W);
          } else {
            unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
          }
          sel.push();
            if (sel.isScalarReg(insn.getSrc(0))) {
              sel.curr.execWidth = 1;
              sel.curr.predicate = GEN_PREDICATE_NONE;
              sel.curr.noMask = 1;
            }
            sel.MOV(unpacked, src);
          sel.pop();
          if (unpacked.reg() != dst.reg()) {
            sel.MOV(dst, unpacked);
          }
        } else { // Do not have native long
          if (!sel.isScalarReg(dst.reg())) {
            unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
            unpacked = GenRegister::retype(unpacked, type);
          } else {
            unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
          }
          GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
          tmp.type = GEN_TYPE_D;
          sel.CONVI64_TO_I(tmp, src);
          sel.MOV(unpacked, tmp);
          if (unpacked.reg() != dst.reg()) {
            sel.MOV(dst, unpacked);
          }
        }
      }

      INLINE void convertI64ToI32(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
      {
        using namespace ir;
        const Type dstType = insn.getDstType();
        const Type srcType = insn.getSrcType();
        const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
        const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
        if (sel.hasLongType()) {
          GenRegister unpacked;
          const uint32_t type = dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D;
          if (!sel.isScalarReg(dst.reg())) {
            unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
            unpacked = GenRegister::retype(unpacked, dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D);
          } else {
            unpacked = GenRegister::retype(sel.unpacked_ud(dst.reg()), type);
          }
          sel.push();
            if (sel.isScalarReg(insn.getSrc(0))) {
              sel.curr.execWidth = 1;
              sel.curr.predicate = GEN_PREDICATE_NONE;
              sel.curr.noMask = 1;
            }
            sel.MOV(unpacked, src);
          sel.pop();
          if (unpacked.reg() != dst.reg()) {
            sel.MOV(dst, unpacked);
          }
        } else {
          sel.CONVI64_TO_I(dst, src);
        }
      }

      INLINE void convertI64ToFloat(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
      {
        using namespace ir;
        const Type dstType = insn.getDstType();
        const Type srcType = insn.getSrcType();
        const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
        const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
        auto dag = sel.regDAG[src.reg()];
        // FIXME: in the future, we need to do a common I64-to-I32 lowering
        // analysis at the LLVM IR layer, which could cover more cases than
        // just this one.
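        /* Sketch of the peephole implemented just below (register names are
         * illustrative only). For GenIR along the lines of
         *
         *   %mask = LOADI.u64 0xffffffff
         *   %m    = AND.u64   %x, %mask
         *   %f    = CVT.float %m
         *
         * both AND operands can be lowered to 32 bits via lowerI64Reg, so we
         * emit a 32-bit AND/OR/XOR plus a single MOV instead of the full
         * 64-bit emulation sequence. */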
SelectionDAG *dag0, *dag1; if (dag && dag->child[0] && dag->child[1]) { if (dag->child[0]->insn.getOpcode() == OP_LOADI) { dag0 = dag->child[1]; dag1 = dag->child[0]; } else { dag0 = dag->child[0]; dag1 = dag->child[1]; } GBE_ASSERT(!(dag->child[0]->insn.getOpcode() == OP_LOADI && dag->child[1]->insn.getOpcode() == OP_LOADI)); if (dag->insn.getOpcode() == OP_AND || dag->insn.getOpcode() == OP_OR || dag->insn.getOpcode() == OP_XOR) { GenRegister src0; GenRegister src1; if (lowerI64Reg(sel, dag0, src0, GEN_TYPE_UD) && lowerI64Reg(sel, dag1, src1, GEN_TYPE_UD)) { switch (dag->insn.getOpcode()) { default: case OP_AND: sel.AND(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break; case OP_OR: sel.OR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break; case OP_XOR: sel.XOR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break; } sel.MOV(dst, GenRegister::retype(dst, GEN_TYPE_UD)); markChildren = false; return; } } } if (!sel.hasLongType()) { GenRegister tmp[6]; for(int i=0; i<6; i++) { tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); } sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.CONVI64_TO_F(dst, src, tmp); sel.pop(); } else { GenRegister unpacked; const uint32_t type = GEN_TYPE_F; unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, type); sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MOV(unpacked, src); sel.pop(); if (unpacked.reg() != dst.reg()) { sel.MOV(dst, unpacked); } } } INLINE void convertSmallIntsToI64(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); const RegisterFamily srcFamily = getFamily(srcType); if (sel.hasLongType() && sel.hasLongRegRestrict()) { // Convert i32/i16/i8 to i64 if hasLongRegRestrict(src and dst hstride must be aligned to the same qword). GenRegister unpacked; GenRegister unpacked_src = src; sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } if(srcFamily == FAMILY_DWORD) { unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, srcType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D); } else if(srcFamily == FAMILY_WORD) { unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, srcType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W); } else if(srcFamily == FAMILY_BYTE) { GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, sel.isScalarReg(insn.getSrc(0)))); tmp = GenRegister::retype(tmp, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W); unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, srcType == TYPE_U8 ? 
GEN_TYPE_UW : GEN_TYPE_W); sel.MOV(tmp, src); unpacked_src = tmp; } else GBE_ASSERT(0); sel.MOV(unpacked, unpacked_src); sel.pop(); sel.MOV(dst, unpacked); } else if (sel.hasLongType()) { sel.MOV(dst, src); } else { sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD))); } } INLINE void convertFToI64(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); if (sel.hasLongType() && sel.hasLongRegRestrict() && srcType == ir::TYPE_FLOAT) { // typical bsw float->long case // Convert float to i64 if hasLongRegRestrict(src and dst hstride must be aligned to the same qword). GenRegister unpacked; GenRegister unpacked_src = src; sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, GEN_TYPE_F); sel.MOV(unpacked, unpacked_src); sel.pop(); sel.MOV(dst, unpacked); } else if (srcType == ir::TYPE_FLOAT) { if (sel.hasLongType()) { // typical bdw float->long case sel.MOV(dst, src); } else { // typical old platform float->long case GenRegister tmp[2]; tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_FLOAT); sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.CONVF_TO_I64(dst, src, tmp); sel.pop(); } } else if (srcType == ir::TYPE_HALF) { /* No need to consider old platform. if we support half, we must have native long. */ GBE_ASSERT(sel.hasLongType()); GBE_ASSERT(sel.hasHalfType()); uint32_t type = dstType == TYPE_U64 ? GEN_TYPE_UD : GEN_TYPE_D; GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))), TYPE_U32), type); sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MOV(tmp, src); if (sel.hasLongRegRestrict()) { // special for BSW case. GenRegister unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, type); sel.MOV(unpacked, tmp); sel.pop(); sel.MOV(dst, unpacked); } else { sel.pop(); sel.MOV(dst, tmp); } } else if (src.type == GEN_TYPE_DF) { GBE_ASSERT(sel.hasDoubleType()); GBE_ASSERT(sel.hasLongType()); //So far, if we support double, we support native long. // Just Mov sel.MOV(dst, src); } else { /* Invalid case. */ GBE_ASSERT(0); } } INLINE void convertBetweenFloatDouble(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); GBE_ASSERT(sel.hasDoubleType()); if (sel.isScalarReg(insn.getDst(0))) { // dst is scalar, just MOV and nothing more. 
GBE_ASSERT(sel.isScalarReg(insn.getSrc(0))); sel.MOV(dst, src); } else if (srcType == ir::TYPE_DOUBLE) { // double to float GBE_ASSERT(dstType == ir::TYPE_FLOAT); GenRegister unpacked; unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, GEN_TYPE_F); sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MOV(unpacked, src); sel.pop(); sel.MOV(dst, unpacked); } else { // float to double, just mov sel.MOV(dst, src); } return; } INLINE void convertBetweenHalfDouble(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); GBE_ASSERT(sel.hasDoubleType()); GBE_ASSERT(sel.hasHalfType()); //So far, if we support double, we support half. if (sel.isScalarReg(insn.getDst(0))) { // uniform case. GBE_ASSERT(sel.isScalarReg(insn.getSrc(0))); GBE_ASSERT(sel.curr.execWidth == 1); GenRegister tmpFloat = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_F); sel.MOV(tmpFloat, src); sel.MOV(dst, tmpFloat); return; } if (dstType == ir::TYPE_DOUBLE) { // half to double. There is no direct double to half MOV, need tmp float. GBE_ASSERT(srcType == ir::TYPE_HALF); GenRegister tmpFloat = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_F); sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MOV(tmpFloat, src); sel.pop(); sel.MOV(dst, tmpFloat); } else { // double to half. No direct MOV from double to half, so double->float->half GBE_ASSERT(srcType == ir::TYPE_DOUBLE); GBE_ASSERT(dstType == ir::TYPE_HALF); sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } // double to float GenRegister unpackedFloat = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpackedFloat = GenRegister::retype(unpackedFloat, GEN_TYPE_F); sel.MOV(unpackedFloat, src); // float to half GenRegister unpackedHalf = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpackedHalf = GenRegister::retype(unpackedHalf, GEN_TYPE_HF); sel.MOV(unpackedHalf, unpackedFloat); sel.pop(); sel.MOV(dst, unpackedHalf); } } INLINE void convertHalfToSmallInts(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); const RegisterFamily dstFamily = getFamily(dstType); // Special case, half -> char/short. /* [DevBDW+]: Format conversion to or from HF (Half Float) must be DWord-aligned and strided by a DWord on the destination. */ GBE_ASSERT(sel.hasHalfType()); GenRegister tmp; sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } if (dstFamily == FAMILY_BYTE) { const uint32_t type = dstType == TYPE_U8 ? 
GEN_TYPE_UB : GEN_TYPE_B; tmp = GenRegister::retype(sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), type); sel.MOV(tmp, src); } else { const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W; tmp = GenRegister::retype(sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), type); sel.MOV(tmp, src); } sel.pop(); sel.MOV(dst, tmp); } INLINE void convertSmallIntsToHalf(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); // Special case, char/uchar -> half /* [DevBDW+]: Format conversion to or from HF (Half Float) must be DWord-aligned and strided by a DWord on the destination. */ GBE_ASSERT(sel.hasHalfType()); GenRegister tmp = GenRegister::retype(sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), GEN_TYPE_HF); sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MOV(tmp, src); sel.pop(); sel.MOV(dst, tmp); } INLINE void convertDoubleToSmallInts(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); const RegisterFamily dstFamily = getFamily(dstType); GBE_ASSERT(sel.hasDoubleType()); GBE_ASSERT(sel.hasHalfType()); //So far, if we support double, we support half. if (sel.isScalarReg(insn.getDst(0))) { // dst is scalar, just MOV and nothing more. GBE_ASSERT(sel.isScalarReg(insn.getSrc(0))); sel.MOV(dst, src); } else { GenRegister unpacked; if (dstFamily == FAMILY_DWORD) { // double to int unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D); } else if (dstFamily == FAMILY_WORD) { // double to short unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W); } else { GBE_ASSERT(dstFamily == FAMILY_BYTE); // double to char unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, dstType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W); } sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MOV(unpacked, src); sel.pop(); sel.MOV(dst, unpacked); } } INLINE void convertI64ToDouble(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); GBE_ASSERT(sel.hasDoubleType()); GBE_ASSERT(sel.hasLongType()); //So far, if we support double, we support native long. 
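    /* Hedged note: with native long support the hardware MOV performs the
     * Q -> DF conversion directly, which is why a single MOV suffices in
     * this routine. */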
// Just Mov sel.MOV(dst, src); } INLINE void convertSmallIntsToDouble(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); const RegisterFamily srcFamily = getFamily(srcType); GBE_ASSERT(sel.hasDoubleType()); GBE_ASSERT(sel.hasLongType()); //So far, if we support double, we support native long. if (sel.hasLongType() && sel.hasLongRegRestrict()) { // Convert i32/i16/i8 to i64 if hasLongRegRestrict(src and dst hstride must be aligned to the same qword). GenRegister unpacked; GenRegister unpacked_src = src; sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } if(srcFamily == FAMILY_DWORD) { unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, srcType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D); } else if(srcFamily == FAMILY_WORD) { unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, srcType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W); } else if(srcFamily == FAMILY_BYTE) { GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, sel.isScalarReg(insn.getSrc(0)))); tmp = GenRegister::retype(tmp, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W); unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0)))); unpacked = GenRegister::retype(unpacked, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W); sel.MOV(tmp, src); unpacked_src = tmp; } else GBE_ASSERT(0); sel.MOV(unpacked, unpacked_src); sel.pop(); sel.MOV(dst, unpacked); } else if (sel.hasLongType()) { sel.MOV(dst, src); } } INLINE bool emitOne(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const { using namespace ir; const Type dstType = insn.getDstType(); const Type srcType = insn.getSrcType(); const RegisterFamily dstFamily = getFamily(dstType); const RegisterFamily srcFamily = getFamily(srcType); const GenRegister dst = sel.selReg(insn.getDst(0), dstType); const GenRegister src = sel.selReg(insn.getSrc(0), srcType); const Opcode opcode = insn.getOpcode(); sel.push(); if (sel.isScalarReg(insn.getDst(0)) == true) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } if(opcode == ir::OP_SAT_CVT) sel.curr.saturate = 1; if (opcode == OP_F16TO32 || opcode == OP_F32TO16) { /* Conversion between float and half. */ convertBetweenHalfFloat(sel, insn, markChildren); } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) { //convert i32/float to small int/half convert32bitsToSmall(sel, insn, markChildren); } else if (dstFamily == FAMILY_WORD && srcFamily == FAMILY_QWORD && srcType != ir::TYPE_DOUBLE) { //convert i64 to i16 and half. 
convertI64To16bits(sel, insn, markChildren);
      } else if (dstFamily == FAMILY_BYTE && srcFamily == FAMILY_QWORD && srcType != ir::TYPE_DOUBLE) {
        // convert i64 to i8
        convertI64ToI8(sel, insn, markChildren);
      } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) &&
                 (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
        // Convert i64 to i32
        convertI64ToI32(sel, insn, markChildren);
      } else if (dstType == ir::TYPE_FLOAT &&
                 (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
        // long -> float
        convertI64ToFloat(sel, insn, markChildren);
      } else if (dstType == ir::TYPE_DOUBLE &&
                 (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
        // long -> double
        convertI64ToDouble(sel, insn, markChildren);
      } else if ((dstType == ir::TYPE_U64 || dstType == ir::TYPE_S64) &&
                 (srcFamily != FAMILY_QWORD && srcType != ir::TYPE_FLOAT && srcType != ir::TYPE_HALF)) {
        // int/short/char to long
        convertSmallIntsToI64(sel, insn, markChildren);
      } else if ((dstType == ir::TYPE_DOUBLE) &&
                 (srcFamily != FAMILY_QWORD && srcType != ir::TYPE_FLOAT && srcType != ir::TYPE_HALF)) {
        // int/short/char to double
        convertSmallIntsToDouble(sel, insn, markChildren);
      } else if ((dstType == ir::TYPE_U64 || dstType == ir::TYPE_S64) &&
                 (srcType == ir::TYPE_FLOAT || srcType == ir::TYPE_HALF || srcType == ir::TYPE_DOUBLE)) {
        // All float types to long
        convertFToI64(sel, insn, markChildren);
      } else if ((srcType == ir::TYPE_FLOAT && dstType == ir::TYPE_DOUBLE) ||
                 (dstType == ir::TYPE_FLOAT && srcType == ir::TYPE_DOUBLE)) {
        // float and double conversion
        convertBetweenFloatDouble(sel, insn, markChildren);
      } else if ((srcType == ir::TYPE_HALF && dstType == ir::TYPE_DOUBLE) ||
                 (dstType == ir::TYPE_HALF && srcType == ir::TYPE_DOUBLE)) {
        // half and double conversion
        convertBetweenHalfDouble(sel, insn, markChildren);
      } else if (srcType == ir::TYPE_DOUBLE && dstType != ir::TYPE_FLOAT &&
                 dstType != ir::TYPE_HALF && dstFamily != FAMILY_QWORD) {
        // double to int/short/char
        convertDoubleToSmallInts(sel, insn, markChildren);
      } else if (srcType == ir::TYPE_HALF && (dstFamily == FAMILY_BYTE || dstFamily == FAMILY_WORD)) {
        // Convert half to small int
        convertHalfToSmallInts(sel, insn, markChildren);
      } else if (dstType == ir::TYPE_HALF && (srcFamily == FAMILY_BYTE || srcFamily == FAMILY_WORD)) {
        // Convert small int to half
        convertSmallIntsToHalf(sel, insn, markChildren);
      } else {
        /* All special cases have been handled, just MOV. */
        sel.MOV(dst, src);
      }
      sel.pop();
      return true;
    }
    DECL_CTOR(ConvertInstruction, 1, 1);
  };

  /*! atomic instruction pattern */
  class AtomicInstructionPattern : public SelectionPattern
  {
  public:
    AtomicInstructionPattern(void) : SelectionPattern(1,1) {
      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
        if (ir::isOpcodeFrom<ir::AtomicInstruction>(ir::Opcode(op)) == true)
          this->opcodes.push_back(ir::Opcode(op));
    }

    /* Used to transform an address from 64-bit to 32-bit. Note that dataport
     * messages cannot accept a scalar register, so we also convert it to a
     * non-uniform register here.
*/ GenRegister convertU64ToU32(Selection::Opaque &sel, GenRegister addr) const { GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD); GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32); sel.MOV(dst, unpacked); return dst; } void untypedAtomicA64Stateless(Selection::Opaque &sel, const ir::AtomicInstruction &insn, unsigned msgPayload, GenRegister dst, GenRegister addr, GenRegister src1, GenRegister src2, GenRegister bti) const { using namespace ir; GenRegister addrQ; const AtomicOps atomicOp = insn.getAtomicOpcode(); GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp; unsigned addrBytes = typeSize(addr.type); GBE_ASSERT(msgPayload <= 3); unsigned simdWidth = sel.curr.execWidth; AddressMode AM = insn.getAddressMode(); if (addrBytes == 4) { addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64); sel.MOV(addrQ, addr); } else { addrQ = addr; } if (simdWidth == 8) { vector msgs; msgs.push_back(addr); msgs.push_back(src1); msgs.push_back(src2); sel.ATOMICA64(dst, genAtomicOp, msgPayload, msgs, bti, sel.getBTITemps(AM)); } else if (simdWidth == 16) { vector msgs; RegisterFamily family = sel.getRegisterFamily(insn.getDst(0)); Type type = getType(family); for (unsigned k = 0; k < msgPayload; k++) { msgs.push_back(sel.selReg(sel.reg(family), type)); } sel.push(); /* first quarter */ sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q1; sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0)); if(msgPayload > 1) { if(family == ir::FAMILY_QWORD) sel.MOV(GenRegister::Qn(msgs[0], 1), GenRegister::Qn(src1, 0)); else sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src1, 0)); } if(msgPayload > 2) { if(family == ir::FAMILY_QWORD) sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src2, 0)); else sel.MOV(GenRegister::Qn(msgs[1], 1), GenRegister::Qn(src2, 0)); } sel.ATOMICA64(GenRegister::Qn(dst, 0), genAtomicOp, msgPayload, msgs, bti, sel.getBTITemps(AM)); /* second quarter */ sel.curr.execWidth = 8; sel.curr.quarterControl = GEN_COMPRESSION_Q2; sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1)); if(msgPayload > 1) { if(family == ir::FAMILY_QWORD) sel.MOV(GenRegister::Qn(msgs[0], 1), GenRegister::Qn(src1, 1)); else sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src1, 1)); } if(msgPayload > 2) { if(family == ir::FAMILY_QWORD) sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src2, 1)); else sel.MOV(GenRegister::Qn(msgs[1], 1), GenRegister::Qn(src2, 1)); } sel.ATOMICA64(GenRegister::Qn(dst, 1), genAtomicOp, msgPayload, msgs, bti, sel.getBTITemps(AM)); sel.pop(); } } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::AtomicInstruction &insn = cast(dag.insn); const AtomicOps atomicOp = insn.getAtomicOpcode(); unsigned srcNum = insn.getSrcNum(); unsigned msgPayload; Register reg = insn.getAddressRegister(); GenRegister address = sel.selReg(reg, getType(sel.getRegisterFamily(reg))); AddressSpace addrSpace = insn.getAddressSpace(); GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL || insn.getAddressSpace() == MEM_PRIVATE || insn.getAddressSpace() == MEM_LOCAL || insn.getAddressSpace() == MEM_GENERIC || insn.getAddressSpace() == MEM_MIXED); unsigned addrBytes = typeSize(address.type); AddressMode AM = insn.getAddressMode(); if (AM == AM_DynamicBti) { msgPayload = srcNum - 1; } else { msgPayload = srcNum; } Type type = getType(sel.getRegisterFamily(insn.getDst(0))); GenRegister dst = sel.selReg(insn.getDst(0), type); GenRegister src0 = 
sel.selReg(insn.getAddressRegister(), type);
      GenRegister src1 = src0, src2 = src0;
      if (msgPayload > 1)
        src1 = sel.selReg(insn.getSrc(1), type);
      if (msgPayload > 2)
        src2 = sel.selReg(insn.getSrc(2), type);

      GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
      if (AM == AM_DynamicBti || AM == AM_StaticBti) {
        if (AM == AM_DynamicBti) {
          Register btiReg = insn.getBtiReg();
          sel.ATOMIC(dst, genAtomicOp, msgPayload, address, src1, src2,
                     sel.selReg(btiReg, type), sel.getBTITemps(AM));
        } else {
          unsigned SI = insn.getSurfaceIndex();
          sel.ATOMIC(dst, genAtomicOp, msgPayload, address, src1, src2,
                     GenRegister::immud(SI), sel.getBTITemps(AM));
        }
      } else if (addrSpace == ir::MEM_LOCAL) {
        // stateless mode; local memory still uses BTI access
        GenRegister addrDW = address;
        if (addrBytes == 8)
          addrDW = convertU64ToU32(sel, address);
        sel.ATOMIC(dst, genAtomicOp, msgPayload, addrDW, src1, src2,
                   GenRegister::immud(0xfe), sel.getBTITemps(AM));
      } else if (addrSpace == ir::MEM_GENERIC) {
        Register localMask = generateLocalMask(sel, address);
        sel.push();
          sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
          GenRegister addrDW = address;
          if (addrBytes == 8)
            addrDW = convertU64ToU32(sel, address);
          sel.ATOMIC(dst, genAtomicOp, msgPayload, addrDW, src1, src2,
                     GenRegister::immud(0xfe), sel.getBTITemps(AM));

          sel.curr.inversePredicate = 1;
          untypedAtomicA64Stateless(sel, insn, msgPayload, dst, address, src1, src2,
                                    GenRegister::immud(0xff));
        sel.pop();
      } else
        untypedAtomicA64Stateless(sel, insn, msgPayload, dst, address, src1, src2,
                                  GenRegister::immud(0xff));

      markAllChildren(dag);
      return true;
    }
  };

  /*! Select instruction pattern */
  class SelectInstructionPattern : public SelectionPattern
  {
  public:
    SelectInstructionPattern(void) : SelectionPattern(1,1) {
      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
        if (ir::isOpcodeFrom<ir::SelectInstruction>(ir::Opcode(op)) == true)
          this->opcodes.push_back(ir::Opcode(op));
    }

    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
    {
      using namespace ir;
      const ir::SelectInstruction &insn = cast<ir::SelectInstruction>(dag.insn);

      // Get all registers for the instruction
      const Type type = insn.getType();
      const GenRegister dst = sel.selReg(insn.getDst(0), type);

      // Look for immediate values for the right source
      GenRegister src0, src1;
      SelectionDAG *dag0 = dag.child[0]; // source 0 is the predicate!
      SelectionDAG *dag1 = dag.child[1];
      SelectionDAG *dag2 = dag.child[2];
      if (dag0) dag0->isRoot = 1;
      bool inverse = false;
      sel.getSrcGenRegImm(dag, dag1, dag2, src0, src1, type, inverse);
      const Register pred = insn.getPredicate();
      sel.push();
        if (sel.isScalarReg(insn.getDst(0)) == true) {
          sel.curr.execWidth = 1;
          sel.curr.predicate = GEN_PREDICATE_NONE;
          sel.curr.noMask = 1;
        }
        sel.curr.inversePredicate ^= inverse;
        sel.curr.physicalFlag = 0;
        sel.curr.flagIndex = pred.value();
        sel.curr.predicate = GEN_PREDICATE_NORMAL;
        // FIXME: in general, if the flag is a uniform flag, we should treat
        // it as an extern flag, as we never generate a uniform physical flag
        // (we can never tell which channel is active when such a flag is
        // used). We need to concentrate this logic in the modFlag bit: if an
        // instruction has that bit set, it will generate a physical flag,
        // otherwise it will not. But the current modFlag is just a hint; we
        // need to fix this in the future.
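        /* For illustration (hedged, simplified): a GenIR select
         *
         *   %d = SEL %pred ? %a : %b
         *
         * becomes, under the virtual flag bound to %pred,
         *
         *   (+f?.?) SEL dst, src0, src1
         *
         * with inversePredicate XOR'ed in above whenever getSrcGenRegImm had
         * to swap or invert the sources to place an immediate on the right. */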
if (!dag0 || (sel.isScalarReg(dag0->insn.getDst(0)))) sel.curr.externFlag = 1; if((type == ir::TYPE_S64 || type == ir::TYPE_U64) && !sel.hasLongType()) sel.SEL_INT64(dst, src0, src1); else sel.SEL(dst, src0, src1); sel.pop(); return true; } }; DECL_PATTERN(TernaryInstruction) { INLINE bool emitOne(Selection::Opaque &sel, const ir::TernaryInstruction &insn, bool &markChildren) const { using namespace ir; const Type type = insn.getType(); const GenRegister dst = sel.selReg(insn.getDst(0), type), src0 = sel.selReg(insn.getSrc(0), type), src1 = sel.selReg(insn.getSrc(1), type), src2 = sel.selReg(insn.getSrc(2), type); switch(insn.getOpcode()) { case OP_I64MADSAT: { GenRegister tmp[9]; int tmp_num; if (!sel.hasLongType()) { tmp_num = 9; for(int i=0; i<9; i++) { tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); tmp[i].type = GEN_TYPE_UD; } } else { tmp_num = 6; for(int i=0; i<6; i++) { tmp[i] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64); tmp[i].type = GEN_TYPE_UL; } } sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.I64MADSAT(dst, src0, src1, src2, tmp, tmp_num); sel.pop(); break; } case OP_MAD: { sel.push(); if (sel.isScalarReg(insn.getDst(0))) sel.curr.execWidth = 1; sel.MAD(dst, src2, src0, src1); sel.pop(); break; } case OP_LRP: { sel.LRP(dst, src0, src1, src2); break; } default: NOT_IMPLEMENTED; } return true; } DECL_CTOR(TernaryInstruction, 1, 1); }; /*! Label instruction pattern */ DECL_PATTERN(LabelInstruction) { INLINE bool emitOne(Selection::Opaque &sel, const ir::LabelInstruction &insn, bool &markChildren) const { using namespace ir; const LabelIndex label = insn.getLabelIndex(); const GenRegister src0 = sel.getBlockIP(); const GenRegister src1 = sel.getLabelImmReg(label); const uint32_t simdWidth = sel.ctx.getSimdWidth(); GBE_ASSERTM(label < sel.ctx.getMaxLabel(), "We reached the maximum label number which is reserved for barrier handling"); sel.LABEL(label); if(!insn.getParent()->needIf) return true; // Do not emit any code for the "returning" block. There is no need for it if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock()) return true; LabelIndex jip; const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex(); if (sel.ctx.hasJIP(&insn)) jip = sel.ctx.getLabelIndex(&insn); else jip = nextLabel; // Emit the mask computation at the head of each basic block sel.push(); sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.flag = 0; sel.curr.subFlag = 1; sel.cmpBlockIP(GEN_CONDITIONAL_LE, src0, src1); sel.pop(); if (sel.block->hasBarrier) { // If this block has barrier, we don't execute the block until all lanes // are 1s. Set each reached lane to 1, then check all lanes. If there is any // lane not reached, we jump to jip. And no need to issue if/endif for // this block, as it will always excute with all lanes activated. sel.push(); sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.curr.flag = 0; sel.curr.subFlag = 1; sel.setBlockIP(src0, sel.ctx.getMaxLabel()); sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; sel.cmpBlockIP(GEN_CONDITIONAL_EQ, src0, sel.ctx.getMaxLabel()); if (simdWidth == 8) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; else if (simdWidth == 16) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; else NOT_IMPLEMENTED; sel.curr.noMask = 1; sel.curr.execWidth = 1; sel.curr.inversePredicate = 1; sel.JMPI(GenRegister::immd(0), jip, label); sel.pop(); // FIXME, if the last BRA is unconditional jump, we don't need to update the label here. 
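      /* Hedged summary of the barrier-block sequence around this point:
       * each lane reaching the block first writes maxLabel into its blockIP
       * ("this lane has arrived"), then a NoMask ALL-channels compare checks
       * whether every lane has arrived; if not, we JMPI out to jip. The
       * push/pop just below finally rewrites blockIP to this block's label
       * for the lanes that will actually execute it. */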
sel.push(); sel.curr.predicate = GEN_PREDICATE_NORMAL; sel.curr.flag = 0; sel.curr.subFlag = 1; sel.setBlockIP(src0, label.value()); sel.pop(); } else { if (sel.ctx.hasJIP(&insn) && // If jump to next label and the endif offset is -1, then // We don't need to add a jmpi here, as the following IF will do the same // thing if all channels are disabled. (jip != nextLabel || sel.block->endifOffset != -1)) { // If it is required, insert a JUMP to bypass the block sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; if (simdWidth == 8) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; else if (simdWidth == 16) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; else NOT_IMPLEMENTED; sel.curr.noMask = 1; sel.curr.execWidth = 1; sel.curr.inversePredicate = 1; sel.JMPI(GenRegister::immd(0), jip, label); sel.pop(); } sel.push(); sel.curr.flag = 0; sel.curr.subFlag = 1; sel.curr.predicate = GEN_PREDICATE_NORMAL; if(!insn.getParent()->needEndif && insn.getParent()->needIf) { ir::LabelIndex label = insn.getParent()->endifLabel; sel.IF(GenRegister::immd(0), label, label); } else sel.IF(GenRegister::immd(0), sel.block->endifLabel, sel.block->endifLabel); sel.pop(); } return true; } DECL_CTOR(LabelInstruction, 1, 1); }; DECL_PATTERN(SampleInstruction) { INLINE void emitLd_ivb(Selection::Opaque &sel, const ir::SampleInstruction &insn, GenRegister msgPayloads[4], uint32_t &msgLen) const { // pre SKL: U, lod, [V], [W] using namespace ir; GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT); uint32_t srcNum = insn.getSrcNum(); msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType()); msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); sel.MOV(msgPayloads[1], GenRegister::immud(0)); if (srcNum > 1) msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType()); if (srcNum > 2) msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType()); // Clear the lod to zero. msgLen = srcNum + 1; } INLINE void emitLd_skl(Selection::Opaque &sel, const ir::SampleInstruction &insn, GenRegister msgPayloads[4], uint32_t &msgLen) const { // SKL: U, [V], [lod], [W] using namespace ir; GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT); uint32_t srcNum = msgLen = insn.getSrcNum(); msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType()); if (srcNum > 1) msgPayloads[1] = sel.selReg(insn.getSrc(1), insn.getSrcType()); if (srcNum > 2) { // Clear the lod to zero. msgPayloads[2] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); sel.MOV(msgPayloads[2], GenRegister::immud(0)); msgLen += 1; msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType()); } } INLINE bool emitOne(Selection::Opaque &sel, const ir::SampleInstruction &insn, bool &markChildren) const { using namespace ir; GenRegister msgPayloads[4]; vector dst(insn.getDstNum()); uint32_t srcNum = insn.getSrcNum(); uint32_t valueID = 0; uint32_t msgLen = 0; for (valueID = 0; valueID < insn.getDstNum(); ++valueID) dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType()); if (insn.getSamplerOffset() != 0) { if(sel.getLdMsgOrder() < LD_MSG_ORDER_SKL) this->emitLd_ivb(sel, insn, msgPayloads, msgLen); else this->emitLd_skl(sel, insn, msgPayloads, msgLen); } else { // U, V, [W] GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT); for (valueID = 0; valueID < srcNum; ++valueID) msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType()); msgLen = srcNum; } // We switch to a fixup bti for linear filter on a image1d array sampling. uint32_t bti = insn.getImageIndex() + (insn.getSamplerOffset() == 2 ? 
BTI_WORKAROUND_IMAGE_OFFSET : 0); if (bti > BTI_MAX_ID) { std::cerr << "Too large bti " << bti; return false; } uint32_t sampler = insn.getSamplerIndex(); sel.SAMPLE(dst.data(), insn.getDstNum(), msgPayloads, msgLen, bti, sampler, insn.getSamplerOffset() != 0, false); return true; } DECL_CTOR(SampleInstruction, 1, 1); }; DECL_PATTERN(VmeInstruction) { INLINE bool emitOne(Selection::Opaque &sel, const ir::VmeInstruction &insn, bool &markChildren) const { using namespace ir; uint32_t msg_type, vme_search_path_lut, lut_sub; msg_type = insn.getMsgType(); vme_search_path_lut = 0; lut_sub = 0; GBE_ASSERT(msg_type == 1); uint32_t payloadLen = 0; //We allocate 5 virtual payload grfs to selection dst register. if(msg_type == 1){ payloadLen = 5; } uint32_t selDstNum = insn.getDstNum() + payloadLen; uint32_t srcNum = insn.getSrcNum(); vector dst(selDstNum); vector payloadVal(srcNum); uint32_t valueID = 0; for (valueID = 0; valueID < insn.getDstNum(); ++valueID) dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType()); for (valueID = insn.getDstNum(); valueID < selDstNum; ++valueID) dst[valueID] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); for (valueID = 0; valueID < srcNum; ++valueID) payloadVal[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType()); uint32_t bti = insn.getImageIndex(); if (bti > BTI_MAX_ID) { std::cerr << "Too large bti " << bti; return false; } sel.VME(bti, dst.data(), payloadVal.data(), selDstNum, srcNum, msg_type, vme_search_path_lut, lut_sub); return true; } DECL_CTOR(VmeInstruction, 1, 1); }; DECL_PATTERN(ImeInstruction) { INLINE bool emitOne(Selection::Opaque &sel, const ir::ImeInstruction &insn, bool &markChildren) const { using namespace ir; uint32_t msg_type; msg_type = insn.getMsgType(); GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3); uint32_t payloadLen = 0; if(msg_type == 2){ payloadLen = 6; } else if(msg_type == 1 || msg_type == 3){ payloadLen = 8; } uint32_t selDstNum = insn.getDstNum() + payloadLen; uint32_t srcNum = insn.getSrcNum(); vector dst(selDstNum); vector payloadVal(srcNum); uint32_t valueID = 0; for (valueID = 0; valueID < insn.getDstNum(); ++valueID) dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType()); for (valueID = insn.getDstNum(); valueID < selDstNum; ++valueID) dst[valueID] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); for (valueID = 0; valueID < srcNum; ++valueID) payloadVal[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType()); uint32_t bti = insn.getImageIndex() + BTI_WORKAROUND_IMAGE_OFFSET; if (bti > BTI_MAX_ID) { std::cerr << "Too large bti " << bti; return false; } sel.IME(bti, dst.data(), payloadVal.data(), selDstNum, srcNum, msg_type); return true; } DECL_CTOR(ImeInstruction, 1, 1); }; /*! Typed write instruction pattern. */ DECL_PATTERN(TypedWriteInstruction) { INLINE bool emitOne(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren) const { const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG)); sel.push(); sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; sel.MOV(header, GenRegister::immud(0)); sel.curr.execWidth = 1; GenRegister channelEn = sel.getOffsetReg(header, 0, 7*4); // Enable all channels. 
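      /* The channel-enable mask lives in DWord 7 of the message header
       * (hence the 7*4 byte offset above); 0xffff enables all 16 channels. */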
sel.MOV(channelEn, GenRegister::immud(0xffff)); sel.pop(); const uint32_t simdWidth = sel.ctx.getSimdWidth(); if (simdWidth == 16) emitWithSimd16(sel, insn, markChildren, header); else if (simdWidth == 8) emitWithSimd8(sel, insn, markChildren, header); else NOT_SUPPORTED; return true; } INLINE bool emitWithSimd16(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren, const GenRegister& header) const { using namespace ir; GenRegister msgs[9]; // (header + U + V + W + LOD + 4) msgs[0] = header; for (uint32_t i = 1; i < 9; ++i) { //SIMD16 will be split into two SIMD8, //each virtual reg in msgs requires one physical reg with 8 DWORDs (32 bytes), //so, declare with FAMILY_WORD, and the allocated size will be sizeof(WORD)*SIMD16 = 32 bytes msgs[i] = sel.selReg(sel.reg(FAMILY_WORD), TYPE_U32); } const uint32_t dims = insn.getSrcNum() - 4; uint32_t bti = insn.getImageIndex(); sel.push(); sel.curr.execWidth = 8; for (uint32_t i = 0; i < 2; ++i) { //SIMD16 split to two SIMD8 sel.curr.quarterControl = (i == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2; uint32_t msgid = 1; for (uint32_t dim = 0; dim < dims; ++dim) { //the coords GenRegister coord = sel.selReg(insn.getSrc(dim), insn.getCoordType()); sel.MOV(GenRegister::retype(msgs[msgid++], coord.type), GenRegister::Qn(coord, i)); } while (msgid < 5) //fill fake coords sel.MOV(msgs[msgid++], GenRegister::immud(0)); for (uint32_t j = 0; j < 4; ++j) { //the data GenRegister data = sel.selReg(insn.getSrc(j + dims), insn.getSrcType()); sel.MOV(GenRegister::retype(msgs[msgid++], data.type), GenRegister::Qn(data, i)); } sel.TYPED_WRITE(msgs, 9, bti, dims == 3); } sel.pop(); return true; } INLINE bool emitWithSimd8(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren, const GenRegister& header) const { using namespace ir; GenRegister msgs[9]; // (header + U + V + W + LOD + 4) msgs[0] = header; const uint32_t dims = insn.getSrcNum() - 4; uint32_t bti = insn.getImageIndex(); uint32_t msgid = 1; for (uint32_t dim = 0; dim < dims; ++dim) { //the coords GenRegister coord = sel.selReg(insn.getSrc(dim), insn.getCoordType()); msgs[msgid++] = coord; } while (msgid < 5) { //fill fake coords GenRegister fake = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); sel.MOV(fake, GenRegister::immud(0)); msgs[msgid++] = fake; } for (uint32_t j = 0; j < 4; ++j) { //the data GenRegister data = sel.selReg(insn.getSrc(j + dims), insn.getSrcType()); msgs[msgid++] = data; } sel.TYPED_WRITE(msgs, 9, bti, dims == 3); return true; } DECL_CTOR(TypedWriteInstruction, 1, 1); }; /*! get image info instruction pattern. 
*/ DECL_PATTERN(GetImageInfoInstruction) { INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn, bool &markChildren) const { using namespace ir; GenRegister dst; dst = sel.selReg(insn.getDst(0), TYPE_U32); GenRegister imageInfoReg = GenRegister::ud1grf(insn.getSrc(0)); sel.MOV(dst, imageInfoReg); return true; } DECL_CTOR(GetImageInfoInstruction, 1, 1); }; class ReadARFInstructionPattern : public SelectionPattern { public: ReadARFInstructionPattern(void) : SelectionPattern(1,1) { this->opcodes.push_back(ir::OP_READ_ARF); } INLINE uint32_t getRegNum(ir::ARFRegister arf) const { if (arf == ir::ARF_TM) { return 0xc0; } else { GBE_ASSERT(0); return 0; } } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::ReadARFInstruction &insn = cast(dag.insn); GenRegister dst; dst = sel.selReg(insn.getDst(0), insn.getType()); sel.push(); sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; sel.curr.execWidth = 8; sel.READ_ARF(dst, GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, getRegNum(insn.getARFRegister()), 0, getGenType(insn.getType()), GEN_VERTICAL_STRIDE_8, GEN_WIDTH_8, GEN_HORIZONTAL_STRIDE_1)); sel.pop(); return true; } }; class SimdShuffleInstructionPattern : public SelectionPattern { public: SimdShuffleInstructionPattern(void) : SelectionPattern(1,1) { this->opcodes.push_back(ir::OP_SIMD_SHUFFLE); } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::SimdShuffleInstruction &insn = cast(dag.insn); assert(insn.getOpcode() == OP_SIMD_SHUFFLE); const Type type = insn.getType(); GenRegister dst = sel.selReg(insn.getDst(0), type); GenRegister src0 = sel.selReg(insn.getSrc(0), type); GenRegister src1; SelectionDAG *dag0 = dag.child[0]; SelectionDAG *dag1 = dag.child[1]; if (dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) { const auto &childInsn = cast(dag1->insn); src1 = getRegisterFromImmediate(childInsn.getImmediate(), TYPE_U32); if (dag0) dag0->isRoot = 1; } else { markAllChildren(dag); src1 = sel.selReg(insn.getSrc(1), TYPE_U32); } sel.push(); if (sel.isScalarReg(insn.getSrc(0))) { if (sel.isScalarReg(insn.getDst(0))) { sel.curr.execWidth = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; } sel.MOV(dst, src0); //no matter what src1 is } else { if (src1.file == GEN_IMMEDIATE_VALUE) { uint32_t offset = src1.value.ud % sel.curr.execWidth; GenRegister reg = GenRegister::subphysicaloffset(src0, offset); reg.vstride = GEN_VERTICAL_STRIDE_0; reg.hstride = GEN_HORIZONTAL_STRIDE_0; reg.width = GEN_WIDTH_1; sel.MOV(dst, reg); } else { GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); uint32_t SHLimm = typeSize(getGenType(type)) == 2 ? 1 : (typeSize(getGenType(type)) == 4 ? 2 : 3); sel.SHL(shiftL, src1, GenRegister::immud(SHLimm)); sel.SIMD_SHUFFLE(dst, src0, shiftL); } } sel.pop(); return true; } }; /*! 
Get a region of a register */ class RegionInstructionPattern : public SelectionPattern { public: RegionInstructionPattern(void) : SelectionPattern(1,1) { this->opcodes.push_back(ir::OP_REGION); } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::RegionInstruction &insn = cast(dag.insn); GenRegister dst, src; dst = sel.selReg(insn.getDst(0), ir::TYPE_U32); src = GenRegister::ud1grf(insn.getSrc(0)); src = sel.getOffsetReg(src, 0, insn.getOffset()*4); sel.push(); sel.curr.noMask = 1; sel.curr.predicate = GEN_PREDICATE_NONE; sel.MOV(dst, src); sel.pop(); markAllChildren(dag); return true; } }; /*! Get a region of a register */ class IndirectMovInstructionPattern : public SelectionPattern { public: IndirectMovInstructionPattern(void) : SelectionPattern(1,1) { this->opcodes.push_back(ir::OP_INDIRECT_MOV); } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::IndirectMovInstruction &insn = cast(dag.insn); GenRegister dst, src0, src1; uint32_t offset = insn.getOffset(); dst = sel.selReg(insn.getDst(0), insn.getType()); src0 = sel.selReg(insn.getSrc(0), TYPE_U32); src1 = sel.selReg(insn.getSrc(1), TYPE_U32); GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD), TYPE_U16); sel.INDIRECT_MOVE(dst, tmp, src0, src1, offset); markAllChildren(dag); return true; } }; class CalcTimestampInstructionPattern : public SelectionPattern { public: CalcTimestampInstructionPattern(void) : SelectionPattern(1,1) { this->opcodes.push_back(ir::OP_CALC_TIMESTAMP); } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::CalcTimestampInstruction &insn = cast(dag.insn); uint32_t pointNum = insn.getPointNum(); uint32_t tsType = insn.getTimestamptType(); GBE_ASSERT(sel.ctx.getSimdWidth() == 16 || sel.ctx.getSimdWidth() == 8); GenRegister tmp; GenRegister ts[5]; int tsNum; if (sel.ctx.getSimdWidth() == 16) { if (!sel.hasLongType()) tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD)), GEN_TYPE_UD); ts[0] = GenRegister::retype(sel.selReg(ir::ocl::profilingts0, ir::TYPE_U32), GEN_TYPE_UD); ts[1] = GenRegister::retype(sel.selReg(ir::ocl::profilingts1, ir::TYPE_U32), GEN_TYPE_UD); ts[2] = GenRegister::retype(sel.selReg(ir::ocl::profilingts2, ir::TYPE_U32), GEN_TYPE_UW); tsNum = 3; } else { if (!sel.hasLongType()) tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); ts[0] = GenRegister::retype(sel.selReg(ir::ocl::profilingts0, ir::TYPE_U32), GEN_TYPE_UD); ts[1] = GenRegister::retype(sel.selReg(ir::ocl::profilingts1, ir::TYPE_U32), GEN_TYPE_UD); ts[2] = GenRegister::retype(sel.selReg(ir::ocl::profilingts2, ir::TYPE_U32), GEN_TYPE_UD); ts[3] = GenRegister::retype(sel.selReg(ir::ocl::profilingts3, ir::TYPE_U32), GEN_TYPE_UD); ts[4] = GenRegister::retype(sel.selReg(ir::ocl::profilingts4, ir::TYPE_U32), GEN_TYPE_UD); tsNum = 5; } sel.push(); { sel.curr.flag = 0; sel.curr.subFlag = 1; sel.CALC_TIMESTAMP(ts, tsNum, tmp, pointNum, tsType); } sel.pop(); markAllChildren(dag); return true; } }; class StoreProfilingInstructionPattern : public SelectionPattern { public: StoreProfilingInstructionPattern(void) : SelectionPattern(1,1) { this->opcodes.push_back(ir::OP_STORE_PROFILING); } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::StoreProfilingInstruction &insn = cast(dag.insn); uint32_t profilingType = insn.getProfilingType(); uint32_t BTI = insn.getBTI(); GBE_ASSERT(sel.ctx.getSimdWidth() == 16 || sel.ctx.getSimdWidth() == 8); 
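    /* Timestamp layout assumed here (mirroring CalcTimestampInstructionPattern
     * above): under SIMD16 the profiling points are packed into three
     * registers (the third retyped to UW), while SIMD8 spreads them across
     * five UD registers; tsNum reflects that split. */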
GenRegister tmp0; GenRegister tmp1; GenRegister ts[5]; int tsNum; if (sel.ctx.getSimdWidth() == 16) { tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); ts[0] = GenRegister::retype(sel.selReg(ir::ocl::profilingts0, ir::TYPE_U32), GEN_TYPE_UD); ts[1] = GenRegister::retype(sel.selReg(ir::ocl::profilingts1, ir::TYPE_U32), GEN_TYPE_UD); ts[2] = GenRegister::retype(sel.selReg(ir::ocl::profilingts2, ir::TYPE_U32), GEN_TYPE_UW); tsNum = 3; } else { tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); tmp1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); ts[0] = GenRegister::retype(sel.selReg(ir::ocl::profilingts0, ir::TYPE_U32), GEN_TYPE_UD); ts[1] = GenRegister::retype(sel.selReg(ir::ocl::profilingts1, ir::TYPE_U32), GEN_TYPE_UD); ts[2] = GenRegister::retype(sel.selReg(ir::ocl::profilingts2, ir::TYPE_U32), GEN_TYPE_UD); ts[3] = GenRegister::retype(sel.selReg(ir::ocl::profilingts3, ir::TYPE_U32), GEN_TYPE_UD); ts[4] = GenRegister::retype(sel.selReg(ir::ocl::profilingts4, ir::TYPE_U32), GEN_TYPE_UD); tsNum = 5; } sel.push(); { sel.curr.flag = 0; sel.curr.subFlag = 1; sel.STORE_PROFILING(profilingType, BTI, tmp0, tmp1, ts, tsNum); } sel.pop(); markAllChildren(dag); return true; } }; class PrintfInstructionPattern : public SelectionPattern { public: PrintfInstructionPattern(void) : SelectionPattern(1,1) { this->opcodes.push_back(ir::OP_PRINTF); } INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; const ir::PrintfInstruction &insn = cast(dag.insn); uint16_t num = insn.getNum(); uint8_t BTI = insn.getBti(); GenRegister tmp0, tmp1; uint32_t srcNum = insn.getSrcNum(); uint32_t i = 0; uint32_t totalSize = 0; bool isContinue = false; GBE_ASSERT(sel.ctx.getSimdWidth() == 16 || sel.ctx.getSimdWidth() == 8); tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); tmp1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); /* Get the total size for one printf statement. */ for (i = 0; i < srcNum; i++) { Type type = insn.getType(i); if (type == TYPE_DOUBLE || type == TYPE_S64 || type == TYPE_U64) { totalSize += 8; } else { totalSize += 4; // Make sure always align to 4. } } i = 0; GenRegister regs[8]; if (srcNum == 0) { sel.PRINTF(BTI, tmp0, tmp1, regs, srcNum, num, isContinue, totalSize); } else { do { uint32_t s = srcNum < 8 ? srcNum : 8; for (uint32_t j = 0; j < s; j++) { regs[j] = sel.selReg(insn.getSrc(i + j), insn.getType(i + j)); } sel.PRINTF(BTI, tmp0, tmp1, regs, s, num, isContinue, totalSize); if (srcNum > 8) { srcNum -= 8; i += 8; } else { srcNum = 0; } isContinue = true; } while(srcNum); } markAllChildren(dag); return true; } }; /*! 
Branch instruction pattern */
  class BranchInstructionPattern : public SelectionPattern
  {
  public:
    BranchInstructionPattern(void) : SelectionPattern(1,1) {
      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
        if (ir::isOpcodeFrom<ir::BranchInstruction>(ir::Opcode(op)) == true)
          this->opcodes.push_back(ir::Opcode(op));
    }

    void emitForwardBranch(Selection::Opaque &sel,
                           const ir::BranchInstruction &insn,
                           ir::LabelIndex dst,
                           ir::LabelIndex src) const
    {
      using namespace ir;
      const GenRegister ip = sel.getBlockIP();

      // We will not emit any jump if we must go to the next block anyway
      const BasicBlock *curr = insn.getParent();
      const BasicBlock *next = curr->getNextBlock();
      const LabelIndex nextLabel = next->getLabelIndex();
      if (insn.isPredicated() == true) {
        const Register pred = insn.getPredicateIndex();
        sel.push();
          // We don't need to set the next label in the PcIP: if there is no
          // backward jump later, everything will obviously work fine, and if
          // there is a backward jump later, all the PcIPs will be updated
          // correctly there.
          sel.curr.physicalFlag = 0;
          sel.curr.flagIndex = pred.value();
          sel.curr.predicate = GEN_PREDICATE_NORMAL;
          sel.setBlockIP(ip, dst.value());
          sel.curr.predicate = GEN_PREDICATE_NONE;
          if (!sel.block->hasBarrier)
            sel.ENDIF(GenRegister::immd(0), nextLabel);
          sel.block->endifOffset = -1;
        sel.pop();
      } else {
        // Update the PcIPs
        const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
        sel.curr.flag = 0;
        sel.curr.subFlag = 1;
        if (insn.getParent()->needEndif)
          sel.setBlockIP(ip, dst.value());

        if (!sel.block->hasBarrier) {
          if (insn.getParent()->needEndif && !insn.getParent()->needIf)
            sel.ENDIF(GenRegister::immd(0), insn.getParent()->endifLabel, insn.getParent()->endifLabel);
          else if (insn.getParent()->needEndif)
            sel.ENDIF(GenRegister::immd(0), nextLabel);
        }
        sel.block->endifOffset = -1;
        if (nextLabel == jip) return;
        // Branch to the jump target
        sel.push();
          sel.curr.execWidth = 1;
          sel.curr.noMask = 1;
          sel.curr.predicate = GEN_PREDICATE_NONE;
          // Actually, the origin of this JMPI should be the beginning of the next BB.
          sel.block->endifOffset -=
            sel.JMPI(GenRegister::immd(0), jip, ir::LabelIndex(curr->getLabelIndex().value() + 1));
        sel.pop();
      }
    }

    void emitBackwardBranch(Selection::Opaque &sel,
                            const ir::BranchInstruction &insn,
                            ir::LabelIndex dst,
                            ir::LabelIndex src) const
    {
      using namespace ir;
      //const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
      const GenRegister ip = sel.getBlockIP();
      const Function &fn = sel.ctx.getFunction();
      const BasicBlock &bb = fn.getBlock(src);
      const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
      const LabelIndex label = bb.getLabelIndex();
      const uint32_t simdWidth = sel.ctx.getSimdWidth();
      GBE_ASSERT(bb.getNextBlock() != NULL);

      if (insn.isPredicated() == true) {
        const Register pred = insn.getPredicateIndex();

        // Update the PcIPs for all the branches. Just put the IPs of the next
        // block.
        // The next instruction will properly update the IPs of the lanes
        // that actually take the branch.
        const LabelIndex next = bb.getNextBlock()->getLabelIndex();
        sel.setBlockIP(ip, next.value());
        GBE_ASSERT(jip == dst);
        sel.push();
        sel.curr.physicalFlag = 0;
        sel.curr.flagIndex = pred.value();
        sel.curr.predicate = GEN_PREDICATE_NORMAL;
        sel.setBlockIP(ip, dst.value());
        sel.block->endifOffset = -1;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        if (!sel.block->hasBarrier)
          sel.ENDIF(GenRegister::immd(0), next);
        sel.curr.execWidth = 1;
        if (simdWidth == 16)
          sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
        else
          sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
        sel.curr.noMask = 1;
        sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, label);
        sel.pop();
      } else {
        const LabelIndex next = bb.getNextBlock()->getLabelIndex();
        // Update the PcIPs
        sel.curr.flag = 0;
        sel.curr.subFlag = 1;
        if (insn.getParent()->needEndif)
          sel.setBlockIP(ip, dst.value());
        sel.block->endifOffset = -1;
        if (!sel.block->hasBarrier) {
          if (insn.getParent()->needEndif && !insn.getParent()->needIf)
            sel.ENDIF(GenRegister::immd(0), insn.getParent()->endifLabel, insn.getParent()->endifLabel);
          else if (insn.getParent()->needEndif)
            sel.ENDIF(GenRegister::immd(0), next);
        }
        // Branch to the jump target
        sel.push();
        sel.curr.execWidth = 1;
        sel.curr.noMask = 1;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, label);
        sel.pop();
      }
    }

    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
    {
      using namespace ir;
      const ir::BranchInstruction &insn = cast<ir::BranchInstruction>(dag.insn);
      const Opcode opcode = insn.getOpcode();
      if (opcode == OP_RET)
        sel.EOT();
      else if (opcode == OP_BRA) {
        const LabelIndex dst = insn.getLabelIndex();
        const LabelIndex src = insn.getParent()->getLabelIndex();
        sel.push();
        if (insn.isPredicated() == true) {
          if (dag.child[0] == NULL)
            sel.curr.externFlag = 1;
        }
        // We handle forward and backward branches differently
        if (uint32_t(dst) <= uint32_t(src))
          this->emitBackwardBranch(sel, insn, dst, src);
        else
          this->emitForwardBranch(sel, insn, dst, src);
        sel.pop();
      } else if (opcode == OP_IF) {
        const Register pred = insn.getPredicateIndex();
        const LabelIndex jip = insn.getLabelIndex();
        LabelIndex uip;
        if (insn.getParent()->matchingEndifLabel != 0)
          uip = insn.getParent()->matchingEndifLabel;
        else
          uip = jip;
        sel.push();
        sel.curr.physicalFlag = 0;
        sel.curr.flagIndex = (uint64_t)pred;
        sel.curr.externFlag = 1;
        sel.curr.inversePredicate = insn.getInversePredicated();
        sel.curr.predicate = GEN_PREDICATE_NORMAL;
        sel.IF(GenRegister::immd(0), jip, uip);
        sel.curr.inversePredicate = 0;
        sel.pop();
      } else if (opcode == OP_ENDIF) {
        const LabelIndex label = insn.getLabelIndex();
        sel.push();
        sel.curr.noMask = 1;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.ENDIF(GenRegister::immd(0), label, label);
        sel.pop();
      } else if (opcode == OP_ELSE) {
        const LabelIndex label = insn.getLabelIndex();
        sel.ELSE(GenRegister::immd(0), label, insn.getParent()->thisElseLabel);
      } else if (opcode == OP_WHILE) {
        const Register pred = insn.getPredicateIndex();
        const LabelIndex jip = insn.getLabelIndex();
        sel.push();
        sel.curr.physicalFlag = 0;
        sel.curr.flagIndex = (uint64_t)pred;
        sel.curr.externFlag = 1;
        sel.curr.inversePredicate = insn.getInversePredicated();
        sel.curr.predicate = GEN_PREDICATE_NORMAL;
        sel.WHILE(GenRegister::immd(0), jip);
        sel.curr.inversePredicate = 0;
        sel.pop();
      } else
        NOT_IMPLEMENTED;
      markAllChildren(dag);
      return true;
    }
  };
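  /* A small sketch of the direction test used by the branch pattern above
   * (illustrative only). Labels are allocated in block order, so a branch
   * whose destination label is not greater than its source label must be a
   * loop back-edge and goes through emitBackwardBranch (predicated JMPI under
   * the ANY8H/ANY16H predicate); everything else is a forward skip:
   *
   *   if (uint32_t(dst) <= uint32_t(src))
   *     emitBackwardBranch(sel, insn, dst, src);  // loop back-edge
   *   else
   *     emitForwardBranch(sel, insn, dst, src);   // forward skip
   */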
  /*! WorkGroup instruction pattern */
  DECL_PATTERN(WorkGroupInstruction)
  {
    /* WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
     * Communication between threads goes through shared local memory; this
     * pattern only prepares the data for the workgroup op. The algorithm
     * logic itself lives in the gen context. */
    INLINE bool emitWGReduce(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
    {
      using namespace ir;
      GBE_ASSERT(insn.getSrcNum() == 3);
      GBE_ASSERT(insn.getSrc(0) == ocl::threadn);
      GBE_ASSERT(insn.getSrc(1) == ocl::threadid);

      const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
      const Type type = insn.getType();
      GenRegister dst = sel.selReg(insn.getDst(0), type);
      GenRegister src = sel.selReg(insn.getSrc(2), type);
      GenRegister tmpData1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
      GenRegister tmpData2 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
      GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U32);
      GenRegister localThreadID = sel.selReg(ocl::threadid, TYPE_U32);
      GenRegister localThreadNUM = sel.selReg(ocl::threadn, TYPE_U32);
      GenRegister localBarrier = GenRegister::ud8grf(sel.reg(FAMILY_DWORD));

      /* Allocate registers for message sending
       * (read/write to shared local memory);
       * only one datum (ud/ul) is needed for thread communication,
       * and we always use SIMD8 to do the read/write. */
      vector<GenRegister> msg;
      msg.push_back(GenRegister::ud8grf(sel.reg(ir::FAMILY_REG))); // address
      msg.push_back(GenRegister::ud8grf(sel.reg(ir::FAMILY_REG))); // data
      if (dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
        msg.push_back(GenRegister::ud8grf(sel.reg(ir::FAMILY_REG))); // data

      /* Insert a barrier to make sure all the vars we are interested in
         have been assigned their final value. */
      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
      /* Pass the shared local memory offset */
      sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr()));
      /* Perform the workgroup op */
      sel.WORKGROUP_OP(workGroupOp, dst, src, tmpData1, localThreadID, localThreadNUM,
                       tmpData2, slmOff, msg, localBarrier);
      return true;
    }

    /* WORKGROUP OP: BROADCAST
     * 1. BARRIER   Ensure all threads have set the correct value for the var
     *              to be broadcast.
     * 2. CMP IDs   Compare the local IDs with the ones specified in the
     *              function call.
     * 3. STORE     Use the flag to control the store of the var; only the
     *              specified item executes the store.
     * 4. BARRIER   Ensure the specified value has been stored.
     * 5. LOAD      Load the stored value into dst on all items; every item
     *              then holds the same, broadcast value. */
    INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
    {
      using namespace ir;
      const uint32_t srcNum = insn.getSrcNum();
      GBE_ASSERT(srcNum >= 2);
      const Type type = insn.getType();
      const GenRegister src = sel.selReg(insn.getSrc(0), type);
      const GenRegister dst = sel.selReg(insn.getDst(0), type);
      const uint32_t slmAddr = insn.getSlmAddr();
      GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
      vector<GenRegister> fakeTemps;
      fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
      fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));

      GenRegister coords[3];
      for (uint32_t i = 1; i < srcNum; i++)
        coords[i - 1] = GenRegister::toUniform(sel.selReg(insn.getSrc(i), TYPE_U32), GEN_TYPE_UD);

      sel.push(); {
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.curr.noMask = 1;
        sel.MOV(addr, GenRegister::immud(slmAddr));
      } sel.pop();

      /* Insert a barrier to make sure all the vars we are interested in
         have been assigned their final value. */
      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);

      sel.push(); {
        sel.curr.flag = 0;
        sel.curr.subFlag = 1;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.curr.noMask = 1;
        GenRegister lid0, lid1, lid2;
        uint32_t dim = srcNum - 1;
        lid0 = GenRegister::retype(sel.selReg(ocl::lid0, TYPE_U32), GEN_TYPE_UD);
        lid1 = GenRegister::retype(sel.selReg(ocl::lid1, TYPE_U32), GEN_TYPE_UD);
        lid2 = GenRegister::retype(sel.selReg(ocl::lid2, TYPE_U32), GEN_TYPE_UD);
        sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0,
                GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
        sel.curr.predicate = GEN_PREDICATE_NORMAL;
        if (dim >= 2)
          sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1,
                  GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
        if (dim >= 3)
          sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2,
                  GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));

        /* Write to shared local memory for BYTE/WORD/DWORD types */
        if (typeSize(src.type) <= 4) {
          GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
          GenRegister _src = GenRegister::retype(src, GEN_TYPE_UD);
          sel.UNTYPED_WRITE(_addr, &_src, 1, GenRegister::immw(0xfe), fakeTemps);
        }
        /* Write to shared local memory for QWORD types */
        else if (typeSize(src.type) == 8) {
          sel.push(); {
            /* Arrange the data in QWORD */
            GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
            GenRegister srcQW = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
            GenRegister srcQW_p1 = src.retype(srcQW, GEN_TYPE_UD);
            GenRegister srcQW_p2 = src.retype(src.offset(srcQW, 2, 0), GEN_TYPE_UD);
            vector<GenRegister> srcVec;
            srcVec.push_back(srcQW_p1);
            srcVec.push_back(srcQW_p2);
            /* Unpack into 2 DWORDs */
            sel.UNPACK_LONG(srcQW, src);
            /* Emit the write through SEND */
            sel.UNTYPED_WRITE(_addr, srcVec.data(), 2, GenRegister::immw(0xfe), fakeTemps);
          } sel.pop();
        } else
          GBE_ASSERT(0);
      } sel.pop();

      /* Make sure the slm var holds a valid value by now */
      sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);

      /* Read from shared local memory for BYTE/WORD/DWORD types */
      if (typeSize(src.type) <= 4) {
        GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
        GenRegister _dst = GenRegister::retype(dst, GEN_TYPE_UD);
        sel.UNTYPED_READ(_addr, &_dst, 1, GenRegister::immw(0xfe), fakeTemps);
      }
      /* Read from shared local memory for QWORD types */
      else if (typeSize(src.type) == 8) {
        GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
        vector<GenRegister> _dst;
        _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32));
        _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32));
        GenRegister _dstQ = dst.toUniform(_dst[0], GEN_TYPE_UL);

        sel.push(); {
          /* Emit the read through SEND */
          sel.curr.execWidth = 8;
          sel.UNTYPED_READ(_addr, _dst.data(), 2, GenRegister::immw(0xfe), fakeTemps);

          /* Reconstruct the QWORD type */
          _dst[0] = dst.toUniform(dst.offset(_dst[0], 0, 4), GEN_TYPE_UD);
          _dst[1] = dst.toUniform(_dst[1], GEN_TYPE_UD);
          sel.curr.execWidth = 1;
          sel.MOV(_dst[0], _dst[1]);
        } sel.pop();

        /* Set all elements assigned to the thread */
        sel.MOV(dst, _dstQ);
      } else
        GBE_ASSERT(0);

      return true;
    }

    INLINE bool emitOne(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn, bool &markChildren) const
    {
      using namespace ir;
      const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
      if (workGroupOp == WORKGROUP_OP_BROADCAST)
        return emitWGBroadcast(sel, insn);
      else if (workGroupOp >= WORKGROUP_OP_ANY && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX)
        return emitWGReduce(sel, insn);
      else
        GBE_ASSERT(0);
      return true;
    }
    DECL_CTOR(WorkGroupInstruction, 1, 1);
  };
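  /* Illustrative walk-through of the broadcast above, with hypothetical
   * parameters: a DWORD var, a 16x1x1 work group and a broadcast from item
   * (3,0,0):
   *   1. BARRIER - every item has produced its value of 'src'
   *   2. CMP     - the flag is set only on the lane where lid0 == 3
   *                (lid1/lid2 are compared too when dim > 1)
   *   3. WRITE   - only that lane stores its 4 bytes at slmAddr
   *   4. BARRIER - the store is now visible to the whole work group
   *   5. READ    - every lane loads the same 4 bytes into 'dst'
   * QWORD values follow the same path but are split into two DWORD channels
   * by UNPACK_LONG before the untyped write, as the code above shows.
   */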
  /*! SubGroup instruction pattern */
  class SubGroupInstructionPattern : public SelectionPattern
  {
  public:
    SubGroupInstructionPattern(void) : SelectionPattern(1,1) {
      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
        if (ir::isOpcodeFrom<ir::SubGroupInstruction>(ir::Opcode(op)) == true)
          this->opcodes.push_back(ir::Opcode(op));
    }

    /* SUBGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
     * Shares the in-thread algorithm with the workgroup ops. */
    INLINE bool emitSGReduce(Selection::Opaque &sel, const ir::SubGroupInstruction &insn) const
    {
      using namespace ir;
      GBE_ASSERT(insn.getSrcNum() == 1);

      const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
      const Type type = insn.getType();
      GenRegister dst = sel.selReg(insn.getDst(0), type);
      GenRegister src = sel.selReg(insn.getSrc(0), type);
      GenRegister tmpData1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
      GenRegister tmpData2 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);

      /* Perform the subgroup op */
      sel.SUBGROUP_OP(workGroupOp, dst, src, tmpData1, tmpData2);
      return true;
    }

    /* SUBGROUP OP: BROADCAST
     * Shares its implementation with the SIMD shuffle. */
    INLINE bool emitSGBroadcast(Selection::Opaque &sel, const ir::SubGroupInstruction &insn, SelectionDAG &dag) const
    {
      using namespace ir;
      GBE_ASSERT(insn.getSrcNum() == 2);
      const Type type = insn.getType();
      const GenRegister src0 = sel.selReg(insn.getSrc(0), type);
      const GenRegister dst = sel.selReg(insn.getDst(0), type);

      GenRegister src1;
      SelectionDAG *dag0 = dag.child[0];
      SelectionDAG *dag1 = dag.child[1];
      if (dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
          canGetRegisterFromImmediate(dag1->insn)) {
        const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
        src1 = getRegisterFromImmediate(childInsn.getImmediate(), TYPE_U32);
        if (dag0) dag0->isRoot = 1;
      } else {
        markAllChildren(dag);
        src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
      }

      sel.push(); {
        if (src1.file == GEN_IMMEDIATE_VALUE) {
          uint32_t offset = src1.value.ud % sel.curr.execWidth;
          GenRegister reg = GenRegister::subphysicaloffset(src0, offset);
          reg.vstride = GEN_VERTICAL_STRIDE_0;
          reg.hstride = GEN_HORIZONTAL_STRIDE_0;
          reg.width = GEN_WIDTH_1;
          sel.MOV(dst, reg);
        } else {
          GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
          uint32_t SHLimm = typeSize(getGenType(type)) == 2 ? 1
                          : (typeSize(getGenType(type)) == 4 ? 2 : 3);
          sel.SHL(shiftL, src1, GenRegister::immud(SHLimm));
          sel.SIMD_SHUFFLE(dst, src0, shiftL);
        }
      } sel.pop();
      return true;
    }

    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
    {
      using namespace ir;
      const ir::SubGroupInstruction &insn = cast<ir::SubGroupInstruction>(dag.insn);
      const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
      if (workGroupOp == WORKGROUP_OP_BROADCAST) {
        return emitSGBroadcast(sel, insn, dag);
      } else if (workGroupOp >= WORKGROUP_OP_ANY && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) {
        if (emitSGReduce(sel, insn))
          markAllChildren(dag);
        else
          return false;
      } else
        GBE_ASSERT(0);
      return true;
    }
  };

  /* Clamp a user-provided 2D block shape so it fits what one SIMD message can
     transfer, then encode it for the media block read/write header. */
  static uint32_t fixBlockSize(uint8_t width, uint8_t height,
                               uint32_t vec_size, uint32_t typeSize,
                               uint32_t simdWidth, uint32_t &block_width)
  {
    uint32_t blocksize = 0;
    if (width && height) {
      if (width * height * typeSize > vec_size * simdWidth * typeSize) {
        if (width <= simdWidth * vec_size) {
          height = vec_size * simdWidth / width;
        } else {
          height = 1;
          width = vec_size * simdWidth / height;
        }
      }
    } else {
      width = simdWidth;
      height = vec_size;
    }
    block_width = typeSize * (width < simdWidth ? width : simdWidth);
    blocksize = (block_width - 1) % 32 | (height - 1) << 16;
    return blocksize;
  }
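  /* Worked example of the encoding computed by fixBlockSize (hypothetical
   * inputs, for illustration only): SIMD16, vec_size = 1, a 4-byte type and
   * no user-provided shape (width = height = 0). The function then picks
   * width = 16 and height = 1, so
   *
   *   block_width = 4 * 16 = 64 bytes
   *   blocksize   = (64 - 1) % 32 | (1 - 1) << 16 = 31
   *
   * i.e. the low bits hold (bytes_per_row - 1) modulo the 32-byte row limit
   * of one message and bits [16..] hold (rows - 1). A 64-byte-wide block is
   * therefore transferred as two 32-byte halves by the patterns below.
   */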
  /*! Media Block Read pattern */
  DECL_PATTERN(MediaBlockReadInstruction)
  {
    bool emitOne(Selection::Opaque &sel, const ir::MediaBlockReadInstruction &insn, bool &markChildren) const
    {
      using namespace ir;
      uint32_t vec_size = insn.getVectorSize();
      uint32_t simdWidth = sel.curr.execWidth;
      const Type type = insn.getType();
      uint32_t typeSize = 0;
      if (type == TYPE_U32) {
        typeSize = 4;
      } else if (type == TYPE_U16) {
        typeSize = 2;
      } else if (type == TYPE_U8) {
        typeSize = 1;
      } else
        NOT_IMPLEMENTED;
      uint32_t response_size = simdWidth * vec_size * typeSize / 32;
      // A ushort in SIMD8 only fills half a register, but the response length is still 1.
      response_size = response_size ? response_size : 1;
      uint32_t block_width = 0;
      uint32_t blocksize = fixBlockSize(insn.getWidth(), insn.getHeight(), vec_size,
                                        typeSize, simdWidth, block_width);

      vector<GenRegister> valuesVec;
      vector<GenRegister> tmpVec;
      for (uint32_t i = 0; i < vec_size; ++i) {
        valuesVec.push_back(sel.selReg(insn.getDst(i), type));
        if ((simdWidth == 16 && typeSize == 4) || typeSize == 1)
          tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
      }
      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);

      // Make the header
      sel.push();
      // Copy r0 into the header first
      sel.curr.execWidth = 8;
      sel.curr.predicate = GEN_PREDICATE_NONE;
      sel.curr.noMask = 1;
      sel.MOV(header, GenRegister::ud8grf(0, 0));
      // Update the header with the coordinates
      sel.curr.execWidth = 1;
      sel.MOV(offsetx, coordx);
      sel.MOV(offsety, coordy);
      // Update block width and height
      sel.MOV(blocksizereg, GenRegister::immud(blocksize));
      sel.pop();

      if (block_width < 64) {
        sel.push();
        sel.curr.execWidth = 8;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.curr.noMask = 1;
        // Now read the data
        if (typeSize == 1) {
          sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), response_size);
          for (uint32_t i = 0; i < vec_size; i++) {
            sel.MOV(valuesVec[i],
                    sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i * simdWidth));
            sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 16),
                    sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i * simdWidth + 8));
          }
        } else
          sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size);
        sel.pop();
      } else if (block_width == 64) {
        sel.push();
        sel.curr.execWidth = 8;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.curr.noMask = 1;
        sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size);
        for (uint32_t i = 0; i < vec_size; i++)
          sel.MOV(valuesVec[i], tmpVec[i]);

        // Second half: update the header with the new coordinate
        sel.curr.execWidth = 1;
        sel.ADD(offsetx, offsetx, GenRegister::immud(32));

        // Now read the data
        sel.curr.execWidth = 8;
        sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size);

        // Move the regs to fit the vector rule
        for (uint32_t i = 0; i < vec_size; i++)
          sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32), tmpVec[i]);
        sel.pop();
      } else
        NOT_IMPLEMENTED;

      return true;
    }
    DECL_CTOR(MediaBlockReadInstruction, 1, 1);
  };
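  /* Header layout assumed by the media block read above (a sketch; byte
   * offsets are within the single GRF copied from r0):
   *   header[0..3]  X byte offset of the block  (offsetx <- coordx)
   *   header[4..7]  Y row offset of the block   (offsety <- coordy)
   *   header[8..11] block shape, the blocksize value from fixBlockSize
   * For a 64-byte-wide block the transfer is split in two: read 32 bytes,
   * ADD 32 to offsetx, then read the second half, exactly as emitOne does.
   * The write pattern below builds the very same header.
   */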
  /*! Media Block Write pattern */
  DECL_PATTERN(MediaBlockWriteInstruction)
  {
    bool emitOne(Selection::Opaque &sel, const ir::MediaBlockWriteInstruction &insn, bool &markChildren) const
    {
      using namespace ir;
      uint32_t vec_size = insn.getVectorSize();
      const Type type = insn.getType();
      uint32_t simdWidth = sel.curr.execWidth;
      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
      const RegisterFamily family = getFamily(type);
      uint32_t typeSize = 0;
      if (type == TYPE_U32) {
        typeSize = 4;
      } else if (type == TYPE_U16) {
        typeSize = 2;
      } else if (type == TYPE_U8) {
        typeSize = 1;
      } else
        NOT_IMPLEMENTED;
      // A ushort in SIMD8 only fills half a register, but the data length is still 1.
      uint32_t data_size = simdWidth * vec_size * typeSize / 32;
      data_size = data_size ? data_size : 1;
      uint32_t block_width = 0;
      uint32_t blocksize = fixBlockSize(insn.getWidth(), insn.getHeight(), vec_size,
                                        typeSize, simdWidth, block_width);

      vector<GenRegister> valuesVec;
      vector<GenRegister> tmpVec;
      for (uint32_t i = 0; i < vec_size; ++i) {
        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
        if (simdWidth == 16 && typeSize == 4)
          tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
        else
          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
      }
      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);

      // Make the header
      sel.push();
      // Copy r0 into the header first
      sel.curr.execWidth = 8;
      sel.curr.predicate = GEN_PREDICATE_NONE;
      sel.curr.noMask = 1;
      sel.MOV(header, GenRegister::ud8grf(0, 0));
      // Update the header with the coordinates
      sel.curr.execWidth = 1;
      sel.MOV(offsetx, coordx);
      sel.MOV(offsety, coordy);
      // Update block width and height
      sel.MOV(blocksizereg, GenRegister::immud(blocksize));
      sel.pop();

      if (block_width < 64) {
        for (uint32_t i = 0; i < vec_size; ++i)
          sel.MOV(tmpVec[i], valuesVec[i]);
        sel.push();
        sel.curr.execWidth = 8;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.curr.noMask = 1;
        // Now write the data
        if (typeSize == 1) {
          for (uint32_t i = 0; i < vec_size; i++) {
            sel.MOV(sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i * simdWidth),
                    valuesVec[i]);
            sel.MOV(sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i * simdWidth + 8),
                    sel.getOffsetReg(valuesVec[i], 0, 16));
          }
          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size);
        } else
          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size);
        sel.pop();
      } else if (block_width == 64) {
        sel.push();
        sel.curr.execWidth = 8;
        sel.curr.predicate = GEN_PREDICATE_NONE;
        sel.curr.noMask = 1;
        for (uint32_t i = 0; i < vec_size; i++)
          sel.MOV(tmpVec[i], valuesVec[i]);
        sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);

        // Second half: update the header with the new coordinate
        sel.curr.execWidth = 1;
        sel.ADD(offsetx, offsetx, GenRegister::immud(32));

        // Move the regs to fit the vector rule
        sel.curr.execWidth = 8;
        for (uint32_t i = 0; i < vec_size; i++)
          sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
        // Now write the data
        sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
        sel.pop();
      } else
        NOT_IMPLEMENTED;

      return true;
    }
    DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
  };

  /*! Sort the patterns */
  INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) {
    if (p0->insnNum != p1->insnNum)
      return p0->insnNum > p1->insnNum;
    return p0->cost < p1->cost;
  }

  SelectionLibrary::SelectionLibrary(void) {
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert();
    this->insert<PrintfInstructionPattern>();
    this->insert<BranchInstructionPattern>();
    this->insert<WorkGroupInstructionPattern>();
    this->insert<SubGroupInstructionPattern>();
    this->insert<MediaBlockReadInstructionPattern>();
    this->insert<MediaBlockWriteInstructionPattern>();

    // Sort all the patterns by the number of instructions they output
    for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
      std::sort(this->patterns[op].begin(), this->patterns[op].end(), cmp);
  }

  SelectionLibrary::~SelectionLibrary(void) {
    for (auto pattern : this->toFree)
      GBE_DELETE(const_cast<SelectionPattern*>(pattern));
  }

  template <typename PatternType>
  void SelectionLibrary::insert(void) {
    const SelectionPattern *pattern = GBE_NEW_NO_ARG(PatternType);
    this->toFree.push_back(pattern);
    for (auto opcode : pattern->opcodes)
      this->patterns[opcode].push_back(pattern);
  }

} /* namespace gbe */