/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see .
*
* Author: Ruiling, Song
*
 * The idea is: as GEN supports at most 4 successive DWORD loads/stores,
 * merging successive loads/stores that are compatible is beneficial.
 * The method for checking whether two loads/stores are compatible is borrowed
 * from the Vectorize passes in llvm.
*/
#include "llvm_includes.hpp"
using namespace llvm;
namespace gbe {
class GenLoadStoreOptimization : public BasicBlockPass {
public:
static char ID;
ScalarEvolution *SE;
const DataLayout *TD;
GenLoadStoreOptimization() : BasicBlockPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const {
#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
AU.addRequired();
AU.addPreserved();
#else
AU.addRequired();
AU.addPreserved();
#endif
AU.setPreservesCFG();
}
virtual bool runOnBasicBlock(BasicBlock &BB) {
#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 38
SE = &getAnalysis().getSE();
#else
SE = &getAnalysis();
#endif
#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 37
TD = &BB.getModule()->getDataLayout();
#elif LLVM_VERSION_MINOR >= 5
DataLayoutPass *DLP = getAnalysisIfAvailable();
TD = DLP ? &DLP->getDataLayout() : nullptr;
#else
TD = getAnalysisIfAvailable();
#endif
return optimizeLoadStore(BB);
}
Type *getValueType(Value *insn);
Value *getPointerOperand(Value *I);
unsigned getAddressSpace(Value *I);
bool isSimpleLoadStore(Value *I);
bool optimizeLoadStore(BasicBlock &BB);
bool isLoadStoreCompatible(Value *A, Value *B, int *dist, int* elementSize, int maxVecSize);
void mergeLoad(BasicBlock &BB, SmallVector &merged, Instruction *start,int offset);
void mergeStore(BasicBlock &BB, SmallVector &merged, Instruction *start,int offset);
bool findConsecutiveAccess(BasicBlock &BB,
SmallVector &merged,
const BasicBlock::iterator &start,
unsigned maxVecSize,
bool isLoad,
int *addrOffset);
#if LLVM_VERSION_MAJOR * 10 + LLVM_VERSION_MINOR >= 40
virtual StringRef getPassName() const
#else
virtual const char *getPassName() const
#endif
{
return "Merge compatible Load/stores for Gen";
}
};
// Pass identifier: LLVM keys the pass on this variable's address, not its value.
char GenLoadStoreOptimization::ID = 0;
// Return the pointer operand of a load or store, or NULL when I is neither.
// (The stripped template arguments of dyn_cast are restored here.)
Value *GenLoadStoreOptimization::getPointerOperand(Value *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
  return NULL;
}
// Return the address space of a load/store's pointer operand, or
// (unsigned)-1 when I is neither a load nor a store.
unsigned GenLoadStoreOptimization::getAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
  return -1;
}
// True when I is a load/store that is neither volatile nor atomic
// (LLVM's isSimple()); only such accesses are safe to merge.
bool GenLoadStoreOptimization::isSimpleLoadStore(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I)) return L->isSimple();
  if (StoreInst *S = dyn_cast<StoreInst>(I)) return S->isSimple();
  return false;
}
// Return the type of the accessed value: the result type for a load,
// the stored operand's type for a store, NULL otherwise.
Type *GenLoadStoreOptimization::getValueType(Value *insn) {
  if (LoadInst *ld = dyn_cast<LoadInst>(insn)) return ld->getType();
  if (StoreInst *st = dyn_cast<StoreInst>(insn)) return st->getValueOperand()->getType();
  return NULL;
}
// Decide whether accesses A and B are candidates for merging: same address
// space and pointer type, both simple, and separated by a compile-time
// constant distance smaller than one full vector. On success *dist receives
// the byte distance (B relative to A) and *elementSize the element size.
bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B, int *dist, int *elementSize, int maxVecSize) {
  Value *ptrA = getPointerOperand(A);
  Value *ptrB = getPointerOperand(B);
  unsigned ASA = getAddressSpace(A);
  unsigned ASB = getAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!ptrA || !ptrB || (ASA != ASB)) return false;
  if (!isSimpleLoadStore(A) || !isSimpleLoadStore(B)) return false;
  // Check that A and B are of the same type.
  if (ptrA->getType() != ptrB->getType()) return false;

  // Calculate the distance via scalar evolution; a non-constant SCEV
  // difference means the relative offset is unknown at compile time.
  const SCEV *ptrSCEVA = SE->getSCEV(ptrA);
  const SCEV *ptrSCEVB = SE->getSCEV(ptrB);
  const SCEV *offsetSCEV = SE->getMinusSCEV(ptrSCEVA, ptrSCEVB);
  const SCEVConstant *constOffSCEV = dyn_cast<SCEVConstant>(offsetSCEV);
  // Non constant distance.
  if (!constOffSCEV) return false;
  int64_t offset = constOffSCEV->getValue()->getSExtValue();
  Type *Ty = cast<PointerType>(ptrA->getType())->getElementType();
  // The instructions are consecutive if the size of the first load/store is
  // the same as the offset.
  int64_t sz = TD->getTypeStoreSize(Ty);
  *dist = -offset;
  *elementSize = sz;
  // An insn with a small distance from the search load/store is a candidate.
  return (abs(-offset) < sz * maxVecSize);
}
void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB,
SmallVector &merged,
Instruction *start,
int offset) {
IRBuilder<> Builder(&BB);
unsigned size = merged.size();
SmallVector values;
for(unsigned i = 0; i < size; i++) {
values.push_back(merged[i]);
}
LoadInst *ld = cast(start);
unsigned align = ld->getAlignment();
unsigned addrSpace = ld->getPointerAddressSpace();
// insert before first load
Builder.SetInsertPoint(ld);
//modify offset
Value *newPtr = ld->getPointerOperand();
if(offset != 0)
{
Type *ptype = ld->getPointerOperand()->getType();
unsigned typeSize = TD->getPointerTypeSize(ptype);
ptype = (typeSize == 4) ? Builder.getInt32Ty():Builder.getInt64Ty();
Value *StartAddr = Builder.CreatePtrToInt(ld->getPointerOperand(), ptype);
Value *offsetVal = ConstantInt::get(ptype, offset);
Value *newAddr = Builder.CreateAdd(StartAddr, offsetVal);
newPtr = Builder.CreateIntToPtr(newAddr, ld->getPointerOperand()->getType());
}
VectorType *vecTy = VectorType::get(ld->getType(), size);
Value *vecPtr = Builder.CreateBitCast(newPtr, PointerType::get(vecTy, addrSpace));
LoadInst *vecValue = Builder.CreateLoad(vecPtr);
vecValue->setAlignment(align);
for (unsigned i = 0; i < size; ++i) {
Value *S = Builder.CreateExtractElement(vecValue, Builder.getInt32(i));
values[i]->replaceAllUsesWith(S);
}
}
// Bookkeeping record pairing a candidate load/store instruction with its
// byte offset from the instruction that started the search.
class mergedInfo {
public:
  Instruction *mInsn;  // the candidate load/store instruction
  int mOffset;         // byte distance from the search-start instruction

  // Populate both fields in one call (records live in a plain array, so
  // initialization happens via this helper rather than a constructor).
  void init(Instruction *insn, int offset) {
    mInsn = insn;
    mOffset = offset;
  }
};
// Strict-weak-ordering functor: sorts mergedInfo records by ascending byte
// offset so that consecutive accesses become adjacent after std::sort.
struct offsetSorter {
  bool operator()(mergedInfo *m0, mergedInfo *m1) const {
    return m0->mOffset < m1->mOffset;
  }
};
// When searching for consecutive memory accesses, we do it within a small
// window; if the window were too large it would take up too much compile time.
// An important rule we follow is: don't change the load/store order.
// The one exception is a load & store pair from different address spaces.
// The return value indicates whether such a reorder happened.
bool
GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
SmallVector &merged,
const BasicBlock::iterator &start,
unsigned maxVecSize,
bool isLoad,
int *addrOffset) {
if(!isSimpleLoadStore(&*start)) return false;
unsigned targetAddrSpace = getAddressSpace(&*start);
BasicBlock::iterator E = BB.end();
BasicBlock::iterator J = start;
++J;
unsigned maxLimit = maxVecSize * 8;
bool crossAddressSpace = false;
// When we are merging loads and there are some other AddressSpace stores
// lies among them, we are saying that loadStoreReorder happens. The same
// for merging stores and there are some other AddressSpace load among them.
bool loadStoreReorder = false;
bool ready = false;
int elementSize;
SmallVector searchInsnArray;
mergedInfo meInfoArray[32];
int indx = 0;
meInfoArray[indx++].init(&*start, 0);
searchInsnArray.push_back(&meInfoArray[0]);
for(unsigned ss = 0; J!= E && ss <= maxLimit; ++ss, ++J) {
if((isLoad && isa(*J)) || (!isLoad && isa(*J))) {
int distance;
if(isLoadStoreCompatible(searchInsnArray[0]->mInsn, &*J, &distance, &elementSize, maxVecSize))
{
meInfoArray[indx].init(&*J, distance);
searchInsnArray.push_back(&meInfoArray[indx]);
indx++;
if (crossAddressSpace)
loadStoreReorder = true;
if(indx >= 32)
break;
}
} else if((isLoad && isa(*J))) {
// simple stop to keep read/write order
StoreInst *st = cast(&*J);
unsigned addrSpace = st->getPointerAddressSpace();
if (addrSpace != targetAddrSpace) {
crossAddressSpace = true;
} else {
break;
}
} else if ((!isLoad && isa(*J))) {
LoadInst *ld = cast(&*J);
unsigned addrSpace = ld->getPointerAddressSpace();
if (addrSpace != targetAddrSpace) {
crossAddressSpace = true;
} else {
break;
}
}
}
if(indx > 1)
{
//try to sort the load/store by the offset from the start
//the farthest is at the beginning. this is easy to find the
//continuous load/store
std::sort(searchInsnArray.begin(), searchInsnArray.end(), offsetSorter());
// try to find continuous loadstore insn in the candidate array
for(unsigned i = 0; i < searchInsnArray.size(); i++)
{
unsigned j;
for(j = 0; j < maxVecSize-1 && (j+i+1) < searchInsnArray.size(); j++)
{
if(searchInsnArray[i+j+1]->mOffset - searchInsnArray[i+j]->mOffset != elementSize)
break;
//this means the search load/store which offset is 0, is in the sequence
if(searchInsnArray[i+j]->mOffset == 0 || searchInsnArray[i+j+1]->mOffset == 0)
ready = true;
}
if(j > 0 && ready)
{
unsigned endIndx = j + 1;
*addrOffset = searchInsnArray[i]->mOffset;
endIndx = (endIndx >= 16) ? 16 : (endIndx >= 8 ? 8 : (endIndx >= 4 ? 4 : endIndx));
for(unsigned k = 0; k < endIndx; k++)
{
merged.push_back(searchInsnArray[i+k]->mInsn);
if (k >= maxVecSize)
break;
}
break;
}
}
}
return loadStoreReorder;
}
void GenLoadStoreOptimization::mergeStore(BasicBlock &BB,
SmallVector &merged,
Instruction *start,
int offset) {
IRBuilder<> Builder(&BB);
unsigned size = merged.size();
SmallVector values;
for(unsigned i = 0; i < size; i++) {
values.push_back(cast(merged[i])->getValueOperand());
}
StoreInst *st = cast(start);
if(!st)
return;
unsigned addrSpace = st->getPointerAddressSpace();
unsigned align = st->getAlignment();
// insert before the last store
Builder.SetInsertPoint(merged[size-1]);
Type *dataTy = st->getValueOperand()->getType();
VectorType *vecTy = VectorType::get(dataTy, size);
Value * parent = UndefValue::get(vecTy);
for(unsigned i = 0; i < size; i++) {
parent = Builder.CreateInsertElement(parent, values[i], ConstantInt::get(IntegerType::get(st->getContext(), 32), i));
}
Value * stPointer = st->getPointerOperand();
if(!stPointer)
return;
//modify offset
Value *newSPtr = stPointer;
if(offset != 0)
{
unsigned typeSize = TD->getPointerTypeSize(stPointer->getType());
Type *ptype = (typeSize == 4) ? Builder.getInt32Ty() : Builder.getInt64Ty();
Value *StartAddr = Builder.CreatePtrToInt(stPointer, ptype);
Value *offsetVal = ConstantInt::get(ptype, offset);
Value *newAddr = Builder.CreateAdd(StartAddr, offsetVal);
newSPtr = Builder.CreateIntToPtr(newAddr, stPointer->getType());
}
Value *newPtr = Builder.CreateBitCast(newSPtr, PointerType::get(vecTy, addrSpace));
StoreInst *newST = Builder.CreateStore(parent, newPtr);
newST->setAlignment(align);
}
// Find a safe iterator (one that will not be deleted after the merge) to
// continue scanning from. If a reorder happened, we must point to the first
// instruction that is NOT in toBeDeleted starting from `current`; with no
// reorder, it is safe to point just past the last of toBeDeleted.
// (Also repairs the mangled "&current" parameter from text extraction.)
static BasicBlock::iterator
findSafeInstruction(SmallVector<Instruction*, 16> &toBeDeleted,
                    const BasicBlock::iterator &current,
                    bool reorder) {
  BasicBlock::iterator safe = current;
  unsigned size = toBeDeleted.size();
  if (reorder) {
    BasicBlock *BB = &*current->getParent();
    // Skip forward over every instruction scheduled for deletion.
    for (; safe != BB->end(); ++safe) {
      if (std::find(toBeDeleted.begin(), toBeDeleted.end(), &*safe) ==
          toBeDeleted.end())
        break;
    }
  } else {
    // TODO we should use the furthest instruction, so that the outer loop
    // ends quicker.
    safe = BasicBlock::iterator(toBeDeleted[size - 1]);
    ++safe;
  }
  return safe;
}
// Main driver: walk the block, and for each mergeable scalar load/store find
// its run of consecutive accesses, emit the vector replacement, and erase the
// originals. Returns true when the block was changed.
bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
  bool changed = false;
  SmallVector<Instruction*, 16> merged;
  for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E; ++BBI) {
    if (isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
      bool isLoad = isa<LoadInst>(*BBI) ? true : false;
      Type *ty = getValueType(&*BBI);
      if (!ty) continue;
      if (ty->isVectorTy()) continue;
      // TODO Support DWORD/WORD/BYTE LOAD for store support DWORD only now.
      if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
            ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
        continue;

      int addrOffset = 0;
      // Narrower elements allow wider vectors: 4 x DWORD, 8 x WORD, 16 x BYTE.
      unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
                            (ty->isIntegerTy(16) ? 8 : 16);
      bool reorder = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad, &addrOffset);
      uint32_t size = merged.size();
      uint32_t pos = 0;
      bool doDeleting = size > 1;
      BasicBlock::iterator startLS = BBI;
      if (doDeleting) {
        // choose next undeleted instruction before the merged ones disappear
        BBI = findSafeInstruction(merged, BBI, reorder);
      }

      while (size > 1) {
        // Emit the largest supported vector width (16/8/4) for each chunk.
        unsigned vecSize = (size >= 16) ? 16 :
                           (size >= 8 ? 8 :
                           (size >= 4 ? 4 : size));
        SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos, merged.begin() + pos + vecSize);
        if (isLoad)
          mergeLoad(BB, mergedVec, &*startLS, addrOffset);
        else
          mergeStore(BB, mergedVec, &*startLS, addrOffset);
        // remove merged insn
        for (uint32_t i = 0; i < mergedVec.size(); i++)
          mergedVec[i]->eraseFromParent();
        changed = true;
        pos += vecSize;
        size -= vecSize;
      }
      if (doDeleting) {
        // adjust BBI back by one, as the for loop will increment it;
        // don't do this if BBI points to the very first instruction.
        if (BBI != BB.begin())
          --BBI;
      }
      merged.clear();
    }
  }
  return changed;
}
// Factory used by the rest of the back-end to instantiate this pass;
// ownership of the returned pass transfers to the caller (pass manager).
BasicBlockPass *createLoadStoreOptimizationPass() {
  return new GenLoadStoreOptimization;
}
};