author | Tobias Grosser <grosser@fim.uni-passau.de> | 2012-07-13 07:21:00 +0000
---|---|---
committer | Tobias Grosser <grosser@fim.uni-passau.de> | 2012-07-13 07:21:00 +0000
commit | b299d281810e336dd241a8bd4a0bf6aad6323fae (patch) |
tree | eafb78c60049b735440384fbcbd2e534da8e6813 |
parent | a9c373e49da7a0c07fbe4f33b913a5de846b6cd9 (diff) |
download | llvm-b299d281810e336dd241a8bd4a0bf6aad6323fae.tar.gz |
Add preliminary implementation for GPGPU code generation.
Translate the selected parallel loop body into a PTX string and run it
with the CUDA driver API. This preliminary implementation is limited to
the following special cases:
  - Support only 2-dimensional parallel loops, with or without a single
    innermost non-parallel loop.
  - Support write memory accesses to only one array per SCoP.
Contributed by: Yabin Hu <yabin.hwu@gmail.com>
llvm-svn: 160164
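To make the generated control flow concrete: the pass is gated behind the new `-enable-polly-gpgpu` flag, with the device triple supplied through `-polly-gpgpu-triple` (see the RUN lines of the new tests). On the host side, `PTXGenerator::finishGeneration()` wraps the kernel launch in a sequence of calls into a GPURuntime support library, which is exactly the sequence the FileCheck lines verify. The C sketch below restates that sequence; the `polly_*` prototypes and the opaque `PollyGPU*` handle types are reconstructed from the declarations this patch creates on the fly, so treat them as an approximation of the runtime interface rather than its authoritative header.

```c
/* Sketch of the host-side call sequence emitted by finishGeneration(),
 * written as plain C.  Prototypes are reconstructed from the Function
 * declarations created in PTXGenerator.cpp. */
#include <stdint.h>

typedef struct PollyGPUContextT PollyGPUContextT;
typedef struct PollyGPUModuleT PollyGPUModuleT;
typedef struct PollyGPUFunctionT PollyGPUFunctionT;
typedef struct PollyGPUDeviceT PollyGPUDeviceT;
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtrT;
typedef struct PollyGPUEventT PollyGPUEventT;

void polly_initDevice(PollyGPUContextT **Context, PollyGPUDeviceT **Device);
void polly_getPTXModule(char *PTXBuffer, PollyGPUModuleT **Module);
void polly_getPTXKernelEntry(char *Entry, PollyGPUModuleT *Module,
                             PollyGPUFunctionT **Kernel);
void polly_allocateMemoryForHostAndDevice(char **HostData,
                                          PollyGPUDevicePtrT **DeviceData,
                                          int64_t Size);
void polly_setKernelParameters(PollyGPUFunctionT *Kernel, int64_t BlockWidth,
                               int64_t BlockHeight,
                               PollyGPUDevicePtrT *DeviceData);
void polly_startTimerByCudaEvent(PollyGPUEventT **Start, PollyGPUEventT **Stop);
void polly_launchKernel(PollyGPUFunctionT *Kernel, int64_t GridWidth,
                        int64_t GridHeight);
void polly_copyFromDeviceToHost(char *HostData, PollyGPUDevicePtrT *DeviceData,
                                int64_t Size);
void polly_stopTimerByCudaEvent(PollyGPUEventT *Start, PollyGPUEventT *Stop,
                                float *ElapsedTime);
void polly_cleanupGPGPUResources(char *HostData, PollyGPUDevicePtrT *DeviceData,
                                 PollyGPUModuleT *Module,
                                 PollyGPUContextT *Context,
                                 PollyGPUFunctionT *Kernel);

/* PTXString is the kernel module produced via the Intrinsic::codegen call,
 * EntryName is the name of the PTX subfunction, and the grid/block sizes
 * come from the extents of the four tiled parallel loops. */
static void polly_gpgpu_launch_sketch(char *PTXString, char *EntryName,
                                      int64_t OutputBytes,
                                      int64_t GridWidth, int64_t GridHeight,
                                      int64_t BlockWidth, int64_t BlockHeight) {
  PollyGPUContextT *Context;
  PollyGPUDeviceT *Device;
  PollyGPUModuleT *Module;
  PollyGPUFunctionT *Kernel;
  PollyGPUEventT *Start, *Stop;
  PollyGPUDevicePtrT *DeviceData;
  char *HostData;
  float ElapsedTime;

  polly_initDevice(&Context, &Device);
  polly_getPTXModule(PTXString, &Module);
  polly_getPTXKernelEntry(EntryName, Module, &Kernel);
  polly_allocateMemoryForHostAndDevice(&HostData, &DeviceData, OutputBytes);
  polly_setKernelParameters(Kernel, BlockWidth, BlockHeight, DeviceData);
  polly_startTimerByCudaEvent(&Start, &Stop);
  polly_launchKernel(Kernel, GridWidth, GridHeight);
  polly_copyFromDeviceToHost(HostData, DeviceData, OutputBytes);
  polly_stopTimerByCudaEvent(Start, Stop, &ElapsedTime);
  polly_cleanupGPGPUResources(HostData, DeviceData, Module, Context, Kernel);
}
```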
-rw-r--r-- | polly/include/polly/CodeGen/PTXGenerator.h | 193
-rwxr-xr-x | polly/include/polly/ScopInfo.h | 3
-rwxr-xr-x | polly/lib/CodeGen/CMakeLists.txt | 1
-rw-r--r-- | polly/lib/CodeGen/CodeGeneration.cpp | 196
-rw-r--r-- | polly/lib/CodeGen/PTXGenerator.cpp | 652
-rw-r--r-- | polly/test/CodeGen/GPGPU/2d_innermost_parallel.c | 16
-rw-r--r-- | polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll | 65
-rw-r--r-- | polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c | 17
-rw-r--r-- | polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll | 88
-rw-r--r-- | polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop | 21
-rw-r--r-- | polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu | 21
-rw-r--r-- | polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop | 17
-rw-r--r-- | polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu | 17
13 files changed, 1307 insertions(+), 0 deletions(-)
diff --git a/polly/include/polly/CodeGen/PTXGenerator.h b/polly/include/polly/CodeGen/PTXGenerator.h new file mode 100644 index 000000000000..64cc739742c3 --- /dev/null +++ b/polly/include/polly/CodeGen/PTXGenerator.h @@ -0,0 +1,193 @@ +//===- PTXGenerator.h - IR helper to create GPGPU LLVM-IR -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains functions to create GPGPU parallel loops as LLVM-IR. +// +//===----------------------------------------------------------------------===// +#ifndef POLLY_CODEGEN_PTXGENERATOR_H +#define POLLY_CODEGEN_PTXGENERATOR_H + +#include "llvm/IRBuilder.h" +#include "llvm/ADT/SetVector.h" + +#include <map> + +namespace llvm { + class Value; + class Pass; + class BasicBlock; +} + +namespace polly { +using namespace llvm; + +class PTXGenerator { +public: + typedef std::map<Value*, Value*> ValueToValueMapTy; + + PTXGenerator(IRBuilder<> &Builder, Pass *P, const std::string &Triple); + + /// @brief Create a GPGPU parallel loop. + /// + /// @param UsedValues A set of LLVM-IR Values that should be available to + /// the new loop body. + /// @param OriginalIVS The new values of the original induction variables. + /// @param VMap This map is filled by createParallelLoop(). It + /// maps the values in UsedValues to Values through which + /// their content is available within the loop body. + /// @param LoopBody A pointer to an iterator that is set to point to the + /// body of the created loop. It should be used to insert + /// instructions that form the actual loop body. + void startGeneration(SetVector<Value*> &UsedValues, + SetVector<Value*> &OriginalIVS, ValueToValueMapTy &VMap, + BasicBlock::iterator *LoopBody); + + /// @brief Execute the post-operations to build a GPGPU parallel loop. + /// + void finishGeneration(Function *SubFunction); + + /// @brief Set the parameters for launching PTX kernel. + /// + /// @param GridW A value of the width of a GPU grid. + /// @param GridH A value of the height of a GPU grid. + /// @param BlockW A value of the width of a GPU block. + /// @param BlockH A value of the height of a GPU block. + void setLaunchingParameters(int GridW, int GridH, int BlockW, int BlockH) { + GridWidth = GridW; + GridHeight = GridH; + BlockWidth = BlockW; + BlockHeight = BlockH; + } + + /// @brief Set the size of the output array. + /// + /// This size is used to allocate memory on the device and the host. + /// + /// @param Bytes Output array size in bytes. + void setOutputBytes(unsigned Bytes) { + OutputBytes = Bytes; + } + +private: + IRBuilder<> &Builder; + Pass *P; + + /// @brief The target triple of the device. + const std::string &GPUTriple; + + /// @brief Parameters used for launching PTX kernel. + int GridWidth, GridHeight, BlockWidth, BlockHeight; + + /// @brief Size of the output array in bytes. + unsigned OutputBytes; + + /// @brief Polly's GPU data types. 
+ StructType *ContextTy, *ModuleTy, *KernelTy, *DeviceTy, *DevDataTy, *EventTy; + + void InitializeGPUDataTypes(); + IntegerType *getInt64Type(); // i64 + PointerType *getI8PtrType(); // char * + PointerType *getPtrI8PtrType(); // char ** + PointerType *getFloatPtrType(); // float * + PointerType *getGPUContextPtrType(); // %struct.PollyGPUContextT * + PointerType *getGPUModulePtrType(); // %struct.PollyGPUModuleT * + PointerType *getGPUDevicePtrType(); // %struct.PollyGPUDeviceT * + PointerType *getPtrGPUDevicePtrType(); // %struct.PollyGPUDevicePtrT * + PointerType *getGPUFunctionPtrType(); // %struct.PollyGPUFunctionT * + PointerType *getGPUEventPtrType(); // %struct.PollyGPUEventT * + + Module *getModule(); + + /// @brief Create the kernel string containing LLVM IR. + /// + /// @param SubFunction A pointer to the device code function. + /// @return A global string variable containing the LLVM IR codes + // of the SubFunction. + Value *createPTXKernelFunction(Function *SubFunction); + + /// @brief Get the entry name of the device kernel function. + /// + /// @param SubFunction A pointer to the device code function. + /// @return A global string variable containing the entry name of + /// the SubFunction. + Value *getPTXKernelEntryName(Function *SubFunction); + + void createCallInitDevice(Value *Context, Value *Device); + void createCallGetPTXModule(Value *Buffer, Value *Module); + void createCallGetPTXKernelEntry(Value *Entry, Value *Module, + Value *Kernel); + void createCallAllocateMemoryForHostAndDevice(Value *HostData, + Value *DeviceData, + Value *Size); + void createCallCopyFromHostToDevice(Value *DeviceData, Value *HostData, + Value *Size); + void createCallCopyFromDeviceToHost(Value *HostData, Value *DeviceData, + Value *Size); + void createCallSetKernelParameters(Value *Kernel, Value *BlockWidth, + Value *BlockHeight, Value *DeviceData); + void createCallLaunchKernel(Value *Kernel, Value *GridWidth, + Value *GridHeight); + void createCallStartTimerByCudaEvent(Value *StartEvent, + Value *StopEvent); + void createCallStopTimerByCudaEvent(Value *StartEvent, Value *StopEvent, + Value *Timer); + void createCallCleanupGPGPUResources(Value *HostData, Value *DeviceData, + Value *Module, Value *Context, + Value *Kernel); + + /// @brief Create the CUDA subfunction. + /// + /// @param UsedValues A set of LLVM-IR Values that should be available to + /// the new loop body. + /// @param VMap This map that is filled by createSubfunction(). It + /// maps the values in UsedValues to Values through which + /// their content is available within the loop body. + /// @param OriginalIVS The new values of the original induction variables. + /// @param SubFunction The newly created SubFunction is returned here. + void createSubfunction(SetVector<Value*> &UsedValues, + SetVector<Value*> &OriginalIVS, + ValueToValueMapTy &VMap, + Function **SubFunction); + + /// @brief Create the definition of the CUDA subfunction. + /// + /// @param NumArgs The number of parameters of this subfunction. This is + /// usually set to the number of memory accesses which + /// will be copied from host to device. + Function *createSubfunctionDefinition(int NumArgs); + + /// @brief Extract all the ptx related subfunctions into a new module. + /// + /// @param M Current module. + /// @return The generated module containing only gpu related + /// subfunctions. + Module *extractPTXFunctionsFromModule(const Module *M); + + /// @brief Get the Value of CUDA block width. 
+ Value *getCUDABlockWidth(); + + /// @brief Get the Value of CUDA block height. + Value *getCUDABlockHeight(); + + /// @brief Get the Value of CUDA Gird width. + Value *getCUDAGridWidth(); + + /// @brief Get the Value of CUDA grid height. + Value *getCUDAGridHeight(); + + /// @brief Get the Value of the bytes of the output array. + Value *getOutputArraySizeInBytes(); + + /// @brief Erase the ptx-related subfunctions and declarations. + /// + /// @param SubFunction A pointer to the device code function. + void eraseUnusedFunctions(Function *SubFunction); +}; +} // end namespace polly +#endif /* POLLY_CODEGEN_PTXGENERATOR_H */ diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h index d9c75e19acd0..10cc81d2a25a 100755 --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -125,6 +125,9 @@ public: /// @brief Is this a read memory access? bool isRead() const { return Type == MemoryAccess::Read; } + /// @brief Is this a write memory access? + bool isWrite() const { return Type == MemoryAccess::Write; } + isl_map *getAccessRelation() const; /// @brief Get an isl string representing this access function. diff --git a/polly/lib/CodeGen/CMakeLists.txt b/polly/lib/CodeGen/CMakeLists.txt index a7f0c86c9305..5b52c9090557 100755 --- a/polly/lib/CodeGen/CMakeLists.txt +++ b/polly/lib/CodeGen/CMakeLists.txt @@ -15,4 +15,5 @@ add_polly_library(PollyCodeGen ${ISL_CODEGEN_FILES} LoopGenerators.cpp Utils.cpp + PTXGenerator.cpp ) diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp index 9cb708f09ca3..ce3318223caf 100644 --- a/polly/lib/CodeGen/CodeGeneration.cpp +++ b/polly/lib/CodeGen/CodeGeneration.cpp @@ -31,6 +31,7 @@ #include "polly/CodeGen/CodeGeneration.h" #include "polly/CodeGen/BlockGenerators.h" #include "polly/CodeGen/LoopGenerators.h" +#include "polly/CodeGen/PTXGenerator.h" #include "polly/CodeGen/Utils.h" #include "polly/Support/GICHelper.h" @@ -66,6 +67,17 @@ OpenMP("enable-polly-openmp", cl::init(false), cl::ZeroOrMore); static cl::opt<bool> +GPGPU("enable-polly-gpgpu", + cl::desc("Generate GPU parallel code"), cl::Hidden, + cl::value_desc("GPGPU code generation enabled if true"), + cl::init(false), cl::ZeroOrMore); + +static cl::opt<std::string> +GPUTriple("polly-gpgpu-triple", + cl::desc("Target triple for GPU code generation"), + cl::Hidden, cl::init("")); + +static cl::opt<bool> AtLeastOnce("enable-polly-atLeastOnce", cl::desc("Give polly the hint, that every loop is executed at least" "once"), cl::Hidden, @@ -284,6 +296,25 @@ private: /// statement. void codegenForOpenMP(const clast_for *f); + /// @brief Create GPGPU device memory access values. + /// + /// Create a list of values that will be set to be parameters of the GPGPU + /// subfunction. These parameters represent device memory base addresses + /// and the size in bytes. + SetVector<Value*> getGPUValues(unsigned &OutputBytes); + + /// @brief Create a GPU parallel for loop. + /// + /// This loop reflects a loop as if it would have been created by a GPU + /// statement. + void codegenForGPGPU(const clast_for *F); + + /// @brief Get innermost statement for the transformed loops. + const clast_stmt *getScheduleInfo(const clast_for *F, + std::vector<int> &NumIters, + unsigned &LoopDepth, + unsigned &NonPLoopDepth); + /// @brief Check if a loop is parallel /// /// Detect if a clast_for loop can be executed in parallel. 
@@ -530,6 +561,161 @@ void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) { Builder.SetInsertPoint(AfterLoop); } +static unsigned getArraySizeInBytes(const ArrayType *AT) { + unsigned Bytes = AT->getNumElements(); + if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType())) + Bytes *= getArraySizeInBytes(T); + else + Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8; + + return Bytes; +} + +SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) { + SetVector<Value*> Values; + OutputBytes = 0; + + // Record the memory reference base addresses. + for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) { + ScopStmt *Stmt = *SI; + for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(), + E = Stmt->memacc_end(); I != E; ++I) { + Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr()); + Values.insert((BaseAddr)); + + // FIXME: we assume that there is one and only one array to be written + // in a SCoP. + int NumWrites = 0; + if ((*I)->isWrite()) { + ++NumWrites; + assert(NumWrites <= 1 && + "We support at most one array to be written in a SCoP."); + if (const PointerType * PT = + dyn_cast<PointerType>(BaseAddr->getType())) { + Type *T = PT->getArrayElementType(); + const ArrayType *ATy = dyn_cast<ArrayType>(T); + OutputBytes = getArraySizeInBytes(ATy); + } + } + } + } + + return Values; +} + +const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F, + std::vector<int> &NumIters, + unsigned &LoopDepth, + unsigned &NonPLoopDepth) { + clast_stmt *Stmt = (clast_stmt *)F; + const clast_for *Result; + bool NonParaFlag = false; + LoopDepth = 0; + NonPLoopDepth = 0; + + while (Stmt) { + if (CLAST_STMT_IS_A(Stmt, stmt_for)) { + const clast_for *T = (clast_for *) Stmt; + if (isParallelFor(T)) { + if (!NonParaFlag) { + NumIters.push_back(getNumberOfIterations(T)); + Result = T; + } + } else + NonParaFlag = true; + + Stmt = T->body; + LoopDepth++; + continue; + } + Stmt = Stmt->next; + } + + assert(NumIters.size() == 4 && + "The loops should be tiled into 4-depth parallel loops and an " + "innermost non-parallel one (if exist)."); + NonPLoopDepth = LoopDepth - NumIters.size(); + assert(NonPLoopDepth <= 1 + && "We support only one innermost non-parallel loop currently."); + return (const clast_stmt *)Result->body; +} + +void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) { + BasicBlock::iterator LoopBody; + SetVector<Value *> Values; + SetVector<Value *> IVS; + std::vector<int> NumIterations; + PTXGenerator::ValueToValueMapTy VMap; + + assert(!GPUTriple.empty() + && "Target triple should be set properly for GPGPU code generation."); + PTXGenerator PTXGen(Builder, P, GPUTriple); + + // Get original IVS and ScopStmt + unsigned TiledLoopDepth, NonPLoopDepth; + const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations, + TiledLoopDepth, NonPLoopDepth); + const clast_stmt *TmpStmt; + const clast_user_stmt *U; + const clast_for *InnerFor; + if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) { + InnerFor = (const clast_for *)InnerStmt; + TmpStmt = InnerFor->body; + } else + TmpStmt = InnerStmt; + U = (const clast_user_stmt *) TmpStmt; + ScopStmt *Statement = (ScopStmt *) U->statement->usr; + for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) { + const Value* IV = Statement->getInductionVariableForDimension(i); + IVS.insert(const_cast<Value *>(IV)); + } + + unsigned OutBytes; + Values = getGPUValues(OutBytes); + PTXGen.setOutputBytes(OutBytes); + PTXGen.startGeneration(Values, IVS, VMap, &LoopBody); 
+ + BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); + Builder.SetInsertPoint(LoopBody); + + BasicBlock *AfterBB = 0; + if (NonPLoopDepth) { + Value *LowerBound, *UpperBound, *IV, *Stride; + Type *IntPtrTy = getIntPtrTy(); + LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy); + UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy); + Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride)); + IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB); + const Value *OldIV_ = Statement->getInductionVariableForDimension(2); + Value *OldIV = const_cast<Value *>(OldIV_); + VMap.insert(std::make_pair<Value*, Value*>(OldIV, IV)); + } + + updateWithValueMap(VMap, /* reverse */ false); + BlockGenerator::generate(Builder, *Statement, ValueMap, P); + updateWithValueMap(VMap, /* reverse */ true); + + if (AfterBB) + Builder.SetInsertPoint(AfterBB->begin()); + + // FIXME: The replacement of the host base address with the parameter of ptx + // subfunction should have been done by updateWithValueMap. We use the + // following codes to avoid affecting other parts of Polly. This should be + // fixed later. + Function *FN = Builder.GetInsertBlock()->getParent(); + for (unsigned j = 0; j < Values.size(); j++) { + Value *baseAddr = Values[j]; + for (Function::iterator B = FN->begin(); B != FN->end(); ++B) { + for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I) + I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]); + } + } + Builder.SetInsertPoint(AfterLoop); + PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1], + NumIterations[2], NumIterations[3]); + PTXGen.finishGeneration(FN); +} + bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) { const clast_stmt *stmt = f->body; @@ -647,6 +833,16 @@ void ClastStmtCodeGen::codegen(const clast_for *f) { } } + if (GPGPU && isParallelFor(f)) { + if (!parallelCodeGeneration) { + parallelCodeGeneration = true; + parallelLoops.push_back(f->iterator); + codegenForGPGPU(f); + parallelCodeGeneration = false; + return; + } + } + codegenForSequential(f); } diff --git a/polly/lib/CodeGen/PTXGenerator.cpp b/polly/lib/CodeGen/PTXGenerator.cpp new file mode 100644 index 000000000000..247b657af8bf --- /dev/null +++ b/polly/lib/CodeGen/PTXGenerator.cpp @@ -0,0 +1,652 @@ +//===------ PTXGenerator.cpp - IR helper to create loops -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains functions to create GPU parallel codes as LLVM-IR. 
+// +//===----------------------------------------------------------------------===// + +#include "polly/CodeGen/PTXGenerator.h" +#include "polly/ScopDetection.h" +#include "polly/ScopInfo.h" + +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; +using namespace polly; + +PTXGenerator::PTXGenerator(IRBuilder<> &Builder, Pass *P, + const std::string &Triple): + Builder(Builder), P(P), GPUTriple(Triple), GridWidth(1), GridHeight(1), + BlockWidth(1), BlockHeight(1), OutputBytes(0) { + + InitializeGPUDataTypes(); +} + +Module *PTXGenerator::getModule() { + return Builder.GetInsertBlock()->getParent()->getParent(); +} + +Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) { + assert(NumArgs == 1 && "we support only one array access now."); + + Module *M = getModule(); + Function *F = Builder.GetInsertBlock()->getParent(); + std::vector<Type*> Arguments; + for (int i = 0; i < NumArgs; i++) + Arguments.push_back(Builder.getInt8PtrTy()); + FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); + Function *FN = Function::Create(FT, Function::InternalLinkage, + F->getName() + "_ptx_subfn", M); + FN->setCallingConv(CallingConv::PTX_Kernel); + + // Do not run any optimization pass on the new function. + P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(FN); + + for (Function::arg_iterator AI = FN->arg_begin(); AI != FN->arg_end(); ++AI) + AI->setName("ptx.Array"); + + return FN; +} + +void PTXGenerator::createSubfunction(SetVector<Value*> &UsedValues, + SetVector<Value*> &OriginalIVS, + PTXGenerator::ValueToValueMapTy &VMap, + Function **SubFunction) { + Function *FN = createSubfunctionDefinition(UsedValues.size()); + Module *M = getModule(); + LLVMContext &Context = FN->getContext(); + IntegerType *Ty = Builder.getInt64Ty(); + + // Store the previous basic block. + BasicBlock *PrevBB = Builder.GetInsertBlock(); + + // Create basic blocks. + BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN); + BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN); + BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN); + + DominatorTree &DT = P->getAnalysis<DominatorTree>(); + DT.addNewBlock(HeaderBB, PrevBB); + DT.addNewBlock(ExitBB, HeaderBB); + DT.addNewBlock(BodyBB, HeaderBB); + + Builder.SetInsertPoint(HeaderBB); + + // Insert VMap items with maps of array base address on the host to base + // address on the device. + Function::arg_iterator AI = FN->arg_begin(); + for (unsigned j = 0; j < UsedValues.size(); j++) { + Value *BaseAddr = UsedValues[j]; + Type *ArrayTy = BaseAddr->getType(); + Value *Param = Builder.CreateBitCast(AI, ArrayTy); + VMap.insert(std::make_pair<Value*, Value*>(BaseAddr, Param)); + AI++; + } + + // FIXME: These intrinsics should be inserted on-demand. However, we insert + // them all currently for simplicity. 
+ Function *GetNctaidX = + Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x); + Function *GetNctaidY = + Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y); + Function *GetCtaidX = + Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x); + Function *GetCtaidY = + Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y); + Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x); + Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y); + Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x); + Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y); + + Value *GridWidth = Builder.CreateCall(GetNctaidX); + GridWidth = Builder.CreateIntCast(GridWidth, Ty, false); + Value *GridHeight = Builder.CreateCall(GetNctaidY); + GridHeight = Builder.CreateIntCast(GridHeight, Ty, false); + Value *BlockWidth = Builder.CreateCall(GetNtidX); + BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false); + Value *BlockHeight = Builder.CreateCall(GetNtidY); + BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false); + Value *BIDx = Builder.CreateCall(GetCtaidX); + BIDx = Builder.CreateIntCast(BIDx, Ty, false); + Value *BIDy = Builder.CreateCall(GetCtaidY); + BIDy = Builder.CreateIntCast(BIDy, Ty, false); + Value *TIDx = Builder.CreateCall(GetTidX); + TIDx = Builder.CreateIntCast(TIDx, Ty, false); + Value *TIDy = Builder.CreateCall(GetTidY); + TIDy = Builder.CreateIntCast(TIDy, Ty, false); + + Builder.CreateBr(BodyBB); + Builder.SetInsertPoint(BodyBB); + + unsigned NumDims = OriginalIVS.size(); + std::vector<Value *> Substitutions; + Value *BlockID, *ThreadID; + switch (NumDims) { + case 1: { + Value *BlockSize = Builder.CreateMul(BlockWidth, BlockHeight, + "p_gpu_blocksize"); + BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); + BlockID = Builder.CreateAdd(BlockID, BIDx); + BlockID = Builder.CreateMul(BlockID, BlockSize); + ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j"); + ThreadID = Builder.CreateAdd(ThreadID, TIDx); + ThreadID = Builder.CreateAdd(ThreadID, BlockID); + Substitutions.push_back(ThreadID); + break; + } + case 2: { + BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); + BlockID = Builder.CreateAdd(BlockID, BIDx); + Substitutions.push_back(BlockID); + ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j"); + ThreadID = Builder.CreateAdd(ThreadID, TIDx); + Substitutions.push_back(ThreadID); + break; + } + case 3: { + BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i"); + BlockID = Builder.CreateAdd(BlockID, BIDx); + Substitutions.push_back(BlockID); + Substitutions.push_back(TIDy); + Substitutions.push_back(TIDx); + break; + } + case 4: { + Substitutions.push_back(BIDy); + Substitutions.push_back(BIDx); + Substitutions.push_back(TIDy); + Substitutions.push_back(TIDx); + break; + } + default: + assert(true && + "We cannot transform parallel loops whose depth is larger than 4."); + return; + } + + assert(OriginalIVS.size() == Substitutions.size() + && "The size of IVS should be equal to the size of substitutions."); + for (unsigned i = 0; i < OriginalIVS.size(); ++i) { + VMap.insert(std::make_pair<Value*, Value*>(OriginalIVS[i], + Substitutions[i])); + } + + Builder.CreateBr(ExitBB); + Builder.SetInsertPoint(--Builder.GetInsertPoint()); + BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); + + // Add the termination of the ptx-device subfunction. 
+ Builder.SetInsertPoint(ExitBB); + Builder.CreateRetVoid(); + + Builder.SetInsertPoint(LoopBody); + *SubFunction = FN; +} + +void PTXGenerator::startGeneration(SetVector<Value*> &UsedValues, + SetVector<Value*> &OriginalIVS, + ValueToValueMapTy &VMap, + BasicBlock::iterator *LoopBody) { + Function *SubFunction; + BasicBlock::iterator PrevInsertPoint = Builder.GetInsertPoint(); + createSubfunction(UsedValues, OriginalIVS, VMap, &SubFunction); + *LoopBody = Builder.GetInsertPoint(); + Builder.SetInsertPoint(PrevInsertPoint); +} + +IntegerType *PTXGenerator::getInt64Type() { + return Builder.getInt64Ty(); +} + +PointerType *PTXGenerator::getI8PtrType() { + return PointerType::getUnqual(Builder.getInt8Ty()); +} + +PointerType *PTXGenerator::getPtrI8PtrType() { + return PointerType::getUnqual(getI8PtrType()); +} + +PointerType *PTXGenerator::getFloatPtrType() { + return llvm::Type::getFloatPtrTy(getModule()->getContext()); +} + +PointerType *PTXGenerator::getGPUContextPtrType() { + return PointerType::getUnqual(ContextTy); +} + +PointerType *PTXGenerator::getGPUModulePtrType() { + return PointerType::getUnqual(ModuleTy); +} + +PointerType *PTXGenerator::getGPUDevicePtrType() { + return PointerType::getUnqual(DeviceTy); +} + +PointerType *PTXGenerator::getPtrGPUDevicePtrType() { + return PointerType::getUnqual(DevDataTy); +} + +PointerType *PTXGenerator::getGPUFunctionPtrType() { + return PointerType::getUnqual(KernelTy); +} + +PointerType *PTXGenerator::getGPUEventPtrType() { + return PointerType::getUnqual(EventTy); +} + +void PTXGenerator::InitializeGPUDataTypes() { + LLVMContext &Context = getModule()->getContext(); + + ContextTy = StructType::create(Context, "struct.PollyGPUContextT"); + ModuleTy = StructType::create(Context, "struct.PollyGPUModuleT"); + KernelTy = StructType::create(Context, "struct.PollyGPUFunctionT"); + DeviceTy = StructType::create(Context, "struct.PollyGPUDeviceT"); + DevDataTy = StructType::create(Context,"struct.PollyGPUDevicePtrT"); + EventTy = StructType::create(Context, "struct.PollyGPUEventT"); +} + +void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) { + const char *Name = "polly_initDevice"; + Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(PointerType::getUnqual(getGPUContextPtrType())); + Args.push_back(PointerType::getUnqual(getGPUDevicePtrType())); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall2(F, Context, Device); +} + +void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) { + const char *Name = "polly_getPTXModule"; + llvm::Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. 
+ if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(getI8PtrType()); + Args.push_back(PointerType::getUnqual(getGPUModulePtrType())); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall2(F, Buffer, Module); +} + +void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module, + Value *Kernel) { + const char *Name = "polly_getPTXKernelEntry"; + llvm::Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(getI8PtrType()); + Args.push_back(getGPUModulePtrType()); + Args.push_back(PointerType::getUnqual(getGPUFunctionPtrType())); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall3(F, Entry, Module, Kernel); +} + +void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData, + Value *DeviceData, + Value *Size) { + const char *Name = "polly_allocateMemoryForHostAndDevice"; + Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(getPtrI8PtrType()); + Args.push_back(PointerType::getUnqual(getPtrGPUDevicePtrType())); + Args.push_back(getInt64Type()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall3(F, HostData, DeviceData, Size); +} + +void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData, + Value *HostData, + Value *Size) { + const char *Name = "polly_copyFromHostToDevice"; + Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(getPtrGPUDevicePtrType()); + Args.push_back(getI8PtrType()); + Args.push_back(getInt64Type()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall3(F, DeviceData, HostData, Size); +} + +void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData, + Value *DeviceData, + Value *Size) { + const char *Name = "polly_copyFromDeviceToHost"; + Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(getI8PtrType()); + Args.push_back(getPtrGPUDevicePtrType()); + Args.push_back(getInt64Type()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall3(F, HostData, DeviceData, Size); +} + +void PTXGenerator::createCallSetKernelParameters(Value *Kernel, + Value *BlockWidth, + Value *BlockHeight, + Value *DeviceData) { + const char *Name = "polly_setKernelParameters"; + Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. 
+ if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(getGPUFunctionPtrType()); + Args.push_back(getInt64Type()); + Args.push_back(getInt64Type()); + Args.push_back(getPtrGPUDevicePtrType()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall4(F, Kernel, BlockWidth, BlockHeight, DeviceData); +} + +void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth, + Value *GridHeight) { + const char *Name = "polly_launchKernel"; + Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(getGPUFunctionPtrType()); + Args.push_back(getInt64Type()); + Args.push_back(getInt64Type()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall3(F, Kernel, GridWidth, GridHeight); +} + +void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent, + Value *StopEvent) { + const char *Name = "polly_startTimerByCudaEvent"; + Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(PointerType::getUnqual(getGPUEventPtrType())); + Args.push_back(PointerType::getUnqual(getGPUEventPtrType())); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall2(F, StartEvent, StopEvent); +} + +void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent, + Value *StopEvent, + Value *Timer) { + const char *Name = "polly_stopTimerByCudaEvent"; + Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. + if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(getGPUEventPtrType()); + Args.push_back(getGPUEventPtrType()); + Args.push_back(getFloatPtrType()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall3(F, StartEvent, StopEvent, Timer); +} + +void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData, + Value *DeviceData, + Value *Module, + Value *Context, + Value *Kernel) { + const char *Name = "polly_cleanupGPGPUResources"; + llvm::Module *M = getModule(); + Function *F = M->getFunction(Name); + + // If F is not available, declare it. 
+ if (!F) { + GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + std::vector<Type*> Args; + Args.push_back(getI8PtrType()); + Args.push_back(getPtrGPUDevicePtrType()); + Args.push_back(getGPUModulePtrType()); + Args.push_back(getGPUContextPtrType()); + Args.push_back(getGPUFunctionPtrType()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); + F = Function::Create(Ty, Linkage, Name, M); + } + + Builder.CreateCall5(F, HostData, DeviceData, Module, Context, Kernel); +} + +Value *PTXGenerator::getCUDAGridWidth() { + return ConstantInt::get(getInt64Type(), GridWidth); +} + +Value *PTXGenerator::getCUDAGridHeight() { + return ConstantInt::get(getInt64Type(), GridHeight); +} + +Value *PTXGenerator::getCUDABlockWidth() { + return ConstantInt::get(getInt64Type(), BlockWidth); +} + +Value *PTXGenerator::getCUDABlockHeight() { + return ConstantInt::get(getInt64Type(), BlockHeight); +} + +Value *PTXGenerator::getOutputArraySizeInBytes() { + return ConstantInt::get(getInt64Type(), OutputBytes); +} + +Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) { + Module *M = getModule(); + std::string LLVMKernelStr; + raw_string_ostream NameROS(LLVMKernelStr); + formatted_raw_ostream FOS(NameROS); + FOS << "target triple = \"" << GPUTriple <<"\"\n"; + SubFunction->print(FOS); + + // Insert ptx intrinsics into the kernel string. + for (Module::iterator I = M->begin(), E = M->end(); I != E; ) { + Function *F = I++; + // Function must be a prototype and unused. + if (F->isDeclaration() && F->isIntrinsic()) { + switch (F->getIntrinsicID()) { + case Intrinsic::ptx_read_nctaid_x: + case Intrinsic::ptx_read_nctaid_y: + case Intrinsic::ptx_read_ctaid_x: + case Intrinsic::ptx_read_ctaid_y: + case Intrinsic::ptx_read_ntid_x: + case Intrinsic::ptx_read_ntid_y: + case Intrinsic::ptx_read_tid_x: + case Intrinsic::ptx_read_tid_y: + F->print(FOS); + break; + default: + break; + } + } + } + + Value *LLVMKernel = Builder.CreateGlobalStringPtr(LLVMKernelStr, + "llvm_kernel"); + Value *MCPU = Builder.CreateGlobalStringPtr("sm_10", "mcpu"); + Value *Features = Builder.CreateGlobalStringPtr("", "cpu_features"); + + Function *GetDeviceKernel = Intrinsic::getDeclaration(M, + Intrinsic::codegen); + + return Builder.CreateCall3(GetDeviceKernel, LLVMKernel, MCPU, Features); +} + +Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) { + StringRef Entry = SubFunction->getName(); + return Builder.CreateGlobalStringPtr(Entry, "ptx_entry"); +} + +void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) { + Module *M = getModule(); + SubFunction->eraseFromParent(); + + if (Function *FuncPTXReadNCtaidX = M->getFunction("llvm.ptx.read.nctaid.x")) + FuncPTXReadNCtaidX->eraseFromParent(); + + if (Function *FuncPTXReadNCtaidY = M->getFunction("llvm.ptx.read.nctaid.y")) + FuncPTXReadNCtaidY->eraseFromParent(); + + if (Function *FuncPTXReadCtaidX = M->getFunction("llvm.ptx.read.ctaid.x")) + FuncPTXReadCtaidX->eraseFromParent(); + + if (Function *FuncPTXReadCtaidY = M->getFunction("llvm.ptx.read.ctaid.y")) + FuncPTXReadCtaidY->eraseFromParent(); + + if (Function *FuncPTXReadNTidX = M->getFunction("llvm.ptx.read.ntid.x")) + FuncPTXReadNTidX->eraseFromParent(); + + if (Function *FuncPTXReadNTidY = M->getFunction("llvm.ptx.read.ntid.y")) + FuncPTXReadNTidY->eraseFromParent(); + + if (Function *FuncPTXReadTidX = M->getFunction("llvm.ptx.read.tid.x")) + FuncPTXReadTidX->eraseFromParent(); + + if (Function *FuncPTXReadTidY = M->getFunction("llvm.ptx.read.tid.y")) + 
FuncPTXReadTidY->eraseFromParent(); +} + +void PTXGenerator::finishGeneration(Function *F) { + // Define data used by the GPURuntime library. + AllocaInst *PtrCUContext = Builder.CreateAlloca(getGPUContextPtrType(), 0, + "phcontext"); + AllocaInst *PtrCUDevice = Builder.CreateAlloca(getGPUDevicePtrType(), 0, + "phdevice"); + AllocaInst *PtrCUModule = Builder.CreateAlloca(getGPUModulePtrType(), 0, + "phmodule"); + AllocaInst *PtrCUKernel = Builder.CreateAlloca(getGPUFunctionPtrType(), 0, + "phkernel"); + AllocaInst *PtrCUStartEvent = Builder.CreateAlloca(getGPUEventPtrType(), 0, + "pstart_timer"); + AllocaInst *PtrCUStopEvent = Builder.CreateAlloca(getGPUEventPtrType(), 0, + "pstop_timer"); + AllocaInst *PtrDevData = Builder.CreateAlloca(getPtrGPUDevicePtrType(), 0, + "pdevice_data"); + AllocaInst *PtrHostData = Builder.CreateAlloca(getI8PtrType(), 0, + "phost_data"); + Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext()); + AllocaInst *PtrElapsedTimes = Builder.CreateAlloca(FloatTy, 0, "ptimer"); + + // Initialize the GPU device. + createCallInitDevice(PtrCUContext, PtrCUDevice); + + // Create the GPU kernel module and entry function. + Value *PTXString = createPTXKernelFunction(F); + Value *PTXEntry = getPTXKernelEntryName(F); + createCallGetPTXModule(PTXString, PtrCUModule); + LoadInst *CUModule = Builder.CreateLoad(PtrCUModule, "cumodule"); + createCallGetPTXKernelEntry(PTXEntry, CUModule, PtrCUKernel); + + // Allocate device memory and its corresponding host memory. + createCallAllocateMemoryForHostAndDevice(PtrHostData, PtrDevData, + getOutputArraySizeInBytes()); + + // Get the pointer to the device memory and set the GPU execution parameters. + LoadInst *DData = Builder.CreateLoad(PtrDevData, "device_data"); + LoadInst *CUKernel = Builder.CreateLoad(PtrCUKernel, "cukernel"); + createCallSetKernelParameters(CUKernel, getCUDABlockWidth(), + getCUDABlockHeight(), DData); + + // Create the start and end timer and record the start time. + createCallStartTimerByCudaEvent(PtrCUStartEvent, PtrCUStopEvent); + + // Launch the GPU kernel. + createCallLaunchKernel(CUKernel, getCUDAGridWidth(), getCUDAGridHeight()); + + // Copy the results back from the GPU to the host. + LoadInst *HData = Builder.CreateLoad(PtrHostData, "host_data"); + createCallCopyFromDeviceToHost(HData, DData, getOutputArraySizeInBytes()); + + // Record the end time. + LoadInst *CUStartEvent = Builder.CreateLoad(PtrCUStartEvent, "start_timer"); + LoadInst *CUStopEvent = Builder.CreateLoad(PtrCUStopEvent, "stop_timer"); + createCallStopTimerByCudaEvent(CUStartEvent, CUStopEvent, + PtrElapsedTimes); + + // Cleanup all the resources used. + LoadInst *CUContext = Builder.CreateLoad(PtrCUContext, "cucontext"); + createCallCleanupGPGPUResources(HData, DData, CUModule, CUContext, + CUKernel); + + // Erase the ptx kernel and device subfunctions and ptx intrinsics from + // current module. 
+ eraseUnusedFunctions(F); +} diff --git a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c new file mode 100644 index 000000000000..b6397d194290 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c @@ -0,0 +1,16 @@ +int A[128][128]; + +int gpu_pure() { + int i,j; + + for(i = 0; i < 128; i++) + for(j = 0; j < 128; j++) + A[i][j] = i*128 + j; + + return 0; +} + +int main() { + int b = gpu_pure(); + return 0; +} diff --git a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll new file mode 100644 index 000000000000..820280367ab6 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll @@ -0,0 +1,65 @@ +; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s +; ModuleID = '2d_innermost_parallel.s' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [128 x [128 x i32]] zeroinitializer, align 16 + +define i32 @gpu_pure() nounwind uwtable { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc6, %entry + %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc6 ], [ 0, %entry ] + %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32 + %exitcond6 = icmp ne i32 %lftr.wideiv5, 128 + br i1 %exitcond6, label %for.body, label %for.end8 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ] + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 128 + br i1 %exitcond, label %for.body3, label %for.end + +for.body3: ; preds = %for.cond1 + %tmp = shl nsw i64 %indvars.iv2, 7 + %tmp7 = add nsw i64 %tmp, %indvars.iv + %arrayidx5 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv + %tmp8 = trunc i64 %tmp7 to i32 + store i32 %tmp8, i32* %arrayidx5, align 4 + br label %for.inc + +for.inc: ; preds = %for.body3 + %indvars.iv.next = add i64 %indvars.iv, 1 + br label %for.cond1 + +for.end: ; preds = %for.cond1 + br label %for.inc6 + +for.inc6: ; preds = %for.end + %indvars.iv.next3 = add i64 %indvars.iv2, 1 + br label %for.cond + +for.end8: ; preds = %for.cond + ret i32 0 +} + +define i32 @main() nounwind uwtable { +entry: + %call = call i32 @gpu_pure() + ret i32 0 +} + +; CHECK: call void @polly_initDevice +; CHECK: call void @polly_getPTXModule +; CHECK: call void @polly_getPTXKernelEntry +; CHECK: call void @polly_allocateMemoryForHostAndDevice +; CHECK: call void @polly_setKernelParameters +; CHECK: call void @polly_startTimerByCudaEvent +; CHECK: call void @polly_launchKernel +; CHECK: call void @polly_copyFromDeviceToHost +; CHECK: call void @polly_stopTimerByCudaEvent +; CHECK: call void @polly_cleanupGPGPUResources diff --git a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c new file mode 100644 index 000000000000..dae115ea3b44 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c @@ -0,0 +1,17 @@ +int A[128][128]; + +int gpu_no_pure() { + int i,j,k; + + for(i = 0; i < 128; i++) + for(j = 0; j < 128; j++) + for(k = 0; k < 256; k++) + A[i][j] += i*123/(k+1)+5-j*k-123; + + return 0; 
+} + +int main() { + int b = gpu_no_pure(); + return 0; +} diff --git a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll new file mode 100644 index 000000000000..588f581b2ef3 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll @@ -0,0 +1,88 @@ +; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s +; ModuleID = '3d_innermost_non_parallel.s' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [128 x [128 x i32]] zeroinitializer, align 16 + +define i32 @gpu_no_pure() nounwind uwtable { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc16, %entry + %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc16 ], [ 0, %entry ] + %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32 + %exitcond6 = icmp ne i32 %lftr.wideiv5, 128 + br i1 %exitcond6, label %for.body, label %for.end18 + +for.body: ; preds = %for.cond + br label %for.cond1 + +for.cond1: ; preds = %for.inc13, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc13 ], [ 0, %for.body ] + %lftr.wideiv = trunc i64 %indvars.iv to i32 + %exitcond1 = icmp ne i32 %lftr.wideiv, 128 + br i1 %exitcond1, label %for.body3, label %for.end15 + +for.body3: ; preds = %for.cond1 + br label %for.cond4 + +for.cond4: ; preds = %for.inc, %for.body3 + %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ] + %exitcond = icmp ne i32 %k.0, 256 + br i1 %exitcond, label %for.body6, label %for.end + +for.body6: ; preds = %for.cond4 + %tmp = mul nsw i64 %indvars.iv2, 123 + %add = add nsw i32 %k.0, 1 + %tmp7 = trunc i64 %tmp to i32 + %div = sdiv i32 %tmp7, %add + %add7 = add nsw i32 %div, 5 + %tmp8 = trunc i64 %indvars.iv to i32 + %mul8 = mul nsw i32 %tmp8, %k.0 + %sub = sub nsw i32 %add7, %mul8 + %sub9 = add nsw i32 %sub, -123 + %arrayidx11 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv + %tmp9 = load i32* %arrayidx11, align 4 + %add12 = add nsw i32 %tmp9, %sub9 + store i32 %add12, i32* %arrayidx11, align 4 + br label %for.inc + +for.inc: ; preds = %for.body6 + %inc = add nsw i32 %k.0, 1 + br label %for.cond4 + +for.end: ; preds = %for.cond4 + br label %for.inc13 + +for.inc13: ; preds = %for.end + %indvars.iv.next = add i64 %indvars.iv, 1 + br label %for.cond1 + +for.end15: ; preds = %for.cond1 + br label %for.inc16 + +for.inc16: ; preds = %for.end15 + %indvars.iv.next3 = add i64 %indvars.iv2, 1 + br label %for.cond + +for.end18: ; preds = %for.cond + ret i32 0 +} + +define i32 @main() nounwind uwtable { +entry: + %call = call i32 @gpu_no_pure() + ret i32 0 +} + +; CHECK: call void @polly_initDevice +; CHECK: call void @polly_getPTXModule +; CHECK: call void @polly_getPTXKernelEntry +; CHECK: call void @polly_allocateMemoryForHostAndDevice +; CHECK: call void @polly_setKernelParameters +; CHECK: call void @polly_startTimerByCudaEvent +; CHECK: call void @polly_launchKernel +; CHECK: call void @polly_copyFromDeviceToHost +; CHECK: call void @polly_stopTimerByCudaEvent +; CHECK: call void @polly_cleanupGPGPUResources diff --git a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop new file mode 100644 index 
000000000000..0d7a260a7002 --- /dev/null +++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop @@ -0,0 +1,21 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end18", + "statements" : [ + { + "accesses" : [ + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }", + "name" : "Stmt_for_body6", + "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }" + } + ] +} diff --git a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu new file mode 100644 index 000000000000..6f007fa7864a --- /dev/null +++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu @@ -0,0 +1,21 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end18", + "statements" : [ + { + "accesses" : [ + { + "kind" : "read", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }" + }, + { + "kind" : "write", + "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }", + "name" : "Stmt_for_body6", + "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }" + } + ] +} diff --git a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop new file mode 100644 index 000000000000..693c5097312d --- /dev/null +++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop @@ -0,0 +1,17 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end8", + "statements" : [ + { + "accesses" : [ + { + "kind" : "write", + "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }", + "name" : "Stmt_for_body3", + "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }" + } + ] +} diff --git a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu new file mode 100644 index 000000000000..fef61050e93a --- /dev/null +++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu @@ -0,0 +1,17 @@ +{ + "context" : "{ : }", + "name" : "for.cond => for.end8", + "statements" : [ + { + "accesses" : [ + { + "kind" : "write", + "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }" + } + ], + "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }", + "name" : "Stmt_for_body3", + "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }" + } + ] +} |
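As a worked example of the index mapping: in the 2d_innermost_parallel test, the transformed+gpu schedule tiles the 128x128 iteration space as i0 = 16*o0 + o1 and i1 = 16*o2 + o3, codegenForGPGPU() passes the four tile loop extents (8, 16, 8, 16) to setLaunchingParameters() as grid and block dimensions, and createSubfunction() (the two-induction-variable case) substitutes a linearized block id for i and a linearized thread id for j. The plain-C sketch below restates what each GPU thread then computes; the ctaid/nctaid/tid/ntid parameters stand for the PTX special-register intrinsics the kernel actually reads (block index, grid width, thread index, block width), and the sketch illustrates the mapping rather than reproducing the generated IR.

```c
/* Per-thread work of the generated kernel for 2d_innermost_parallel.ll,
 * reconstructed from the 2-IV case in PTXGenerator::createSubfunction().
 * With the launch configuration derived from the tiled schedule
 * (8x16 grid of 8x16 blocks), i and j each cover 0..127 exactly once. */
#include <stdint.h>

static int A[128][128];

static void kernel_body_sketch(int64_t ctaid_x, int64_t ctaid_y,
                               int64_t nctaid_x, int64_t tid_x, int64_t tid_y,
                               int64_t ntid_x) {
  int64_t i = ctaid_y * nctaid_x + ctaid_x; /* p_gpu_index_i: block id  */
  int64_t j = tid_y * ntid_x + tid_x;       /* p_gpu_index_j: thread id */
  A[i][j] = (int)(i * 128 + j);             /* body of Stmt_for_body3   */
}
```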