From 0b207a3765b49eb0e7a4c98b322c178e2e567275 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song@intel.com>
Date: Fri, 27 Feb 2015 14:33:08 +0800
Subject: GBE: Support unaligned load/store of dword/qword in GenIR.

Although OpenCL does not allow unaligned load/store of dword/qword,
LLVM may still generate such instructions, especially when a
large integer load/store is legalized into qword loads/stores with
a possibly unaligned address. The implementation is simple:
for a store, bitcast the d/q word to a vector of bytes before writing out;
for a load, load a vector of bytes and then bitcast them to the d/q word.

Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
---
 backend/src/llvm/llvm_gen_backend.cpp | 76 +++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 7e8bb651..201944ad 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -649,6 +649,8 @@ namespace gbe
                   Value *llvmValue, const ir::Register ptr,
                   const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
                   bool dwAligned);
+    // handle load/store (selected by isLoad) of dword/qword with unaligned address
+    void emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned);
     void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
     private:
       ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
@@ -3667,6 +3669,67 @@ namespace gbe
     }
     GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
   }
+  // Emit a dword/qword load or store (selected by isLoad) with an unaligned address as byte accesses plus a BITCAST.
+  void GenWriter::emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned)
+  {
+    Type *llvmType = llvmValues->getType();
+    const ir::Type type = getType(ctx, llvmType);
+    unsigned byteSize = getTypeByteSize(unit, llvmType);
+    const ir::Register ptr = this->getRegister(llvmPtr);
+    // Split the value type into its element type and element count (1 for a scalar).
+    Type *elemType = llvmType;
+    unsigned elemNum = 1;
+    if (!isScalarType(llvmType)) {
+      VectorType *vectorType = cast<VectorType>(llvmType);
+      elemType = vectorType->getElementType();
+      elemNum = vectorType->getNumElements();
+    }
+    // Registers of the d/q word elements; an undef element is replaced by a null constant of elemType.
+    vector<ir::Register> tupleData;
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+      ir::Register reg;
+      if(regTranslator.isUndefConst(llvmValues, elemID)) {
+        Value *v = Constant::getNullValue(elemType);
+        reg = this->getRegister(v);
+      } else
+        reg = this->getRegister(llvmValues, elemID);
+
+      tupleData.push_back(reg);
+    }
+    const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
+    // One new byte register per byte of the value: the byte-wise view used for the memory access.
+    vector<ir::Register> byteTupleData;
+    for (uint32_t elemID = 0; elemID < byteSize; ++elemID) {
+      byteTupleData.push_back(ctx.reg(ir::FAMILY_BYTE));
+    }
+    const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
+    // Load: read the bytes, then bitcast them to d/q words. Store: bitcast to bytes, then write them out.
+    if (isLoad) {
+      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
+      ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
+    } else {
+      ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
+      // FIXME: byte scatter does not handle correctly vector store, after fix that,
+      //        we can directly use one store instruction like:
+      //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
+      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+      for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
+        const ir::Register reg = byteTupleData[elemID];
+        ir::Register addr;
+        if (elemID == 0)
+          addr = ptr;
+        else {
+          const ir::Register offset = ctx.reg(pointerFamily);
+          const ir::ImmediateIndex immIndex = ctx.newImmediate(int32_t(elemID));
+          addr = ctx.reg(pointerFamily);
+          ctx.LOADI(ir::TYPE_S32, offset, immIndex);
+          ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
+        }
+        // reg is a single byte register, so the store is typed U8 (not the d/q word type), matching the load path.
+        ctx.STORE(ir::TYPE_U8, addr, addrSpace, dwAligned, binding, reg);
+      }
+    }
+  }
 
   extern int OCL_SIMD_WIDTH;
   template <bool isLoad, typename T>
@@ -3682,6 +3745,19 @@ namespace gbe
     ir::BTI binding;
     gatherBTI(&I, binding);
 
+    Type *scalarType = llvmType;  // becomes the element type when llvmType is a vector
+    if (!isScalarType(llvmType)) {
+      VectorType *vectorType = cast<VectorType>(llvmType);
+      scalarType = vectorType->getElementType();
+    }
+    // i32/i64 accesses (scalar or vector element) with !dwAligned are emitted by emitUnalignedDQLoadStore.
+    if (!dwAligned
+       && (scalarType == IntegerType::get(I.getContext(), 64)
+          || scalarType == IntegerType::get(I.getContext(), 32))
+       ) {
+      emitUnalignedDQLoadStore(llvmPtr, llvmValues, addrSpace, binding, isLoad, dwAligned);
+      return;
+    }
     // Scalar is easy. We neednot build register tuples
     if (isScalarType(llvmType) == true) {
       const ir::Type type = getType(ctx, llvmType);
-- 
cgit v1.2.1