polly/tools/GPURuntime/GPUJIT.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123

/******************************************************************************/
/*                                                                            */
/* Part of the LLVM Project, under the Apache License v2.0 with LLVM          */
/* Exceptions.                                                                */
/* See https://llvm.org/LICENSE.txt for license information.                  */
/* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception                    */
/*                                                                            */
/******************************************************************************/
/*                                                                            */
/*  This file defines GPUJIT.                                                 */
/*                                                                            */
/******************************************************************************/

#ifndef GPUJIT_H_
#define GPUJIT_H_
#include "stddef.h"

/*
 * The following demonstrates how we can use the GPURuntime library to
 * execute a GPU kernel.
 *
 * char KernelString[] = "\n\
 *   .version 1.4\n\
 *   .target sm_10, map_f64_to_f32\n\
 *   .entry _Z8myKernelPi (\n\
 *   .param .u64 __cudaparm__Z8myKernelPi_data)\n\
 *   {\n\
 *     .reg .u16 %rh<4>;\n\
 *     .reg .u32 %r<5>;\n\
 *     .reg .u64 %rd<6>;\n\
 *     cvt.u32.u16     %r1, %tid.x;\n\
 *     mov.u16         %rh1, %ctaid.x;\n\
 *     mov.u16         %rh2, %ntid.x;\n\
 *     mul.wide.u16    %r2, %rh1, %rh2;\n\
 *     add.u32         %r3, %r1, %r2;\n\
 *     ld.param.u64    %rd1, [__cudaparm__Z8myKernelPi_data];\n\
 *     cvt.s64.s32     %rd2, %r3;\n\
 *     mul.wide.s32    %rd3, %r3, 4;\n\
 *     add.u64         %rd4, %rd1, %rd3;\n\
 *     st.global.s32   [%rd4+0], %r3;\n\
 *     exit;\n\
 *   }\n\
 * ";
 *
 * const char *Entry = "_Z8myKernelPi";
 *
 * int main() {
 *   PollyGPUFunction *Kernel;
 *   PollyGPUContext *Context;
 *   PollyGPUDevicePtr *DevArray;
 *   int *HostData;
 *   int MemSize;
 *
 *   int GridX = 8;
 *   int GridY = 8;
 *
 *   int BlockX = 16;
 *   int BlockY = 16;
 *   int BlockZ = 1;
 *
 *   MemSize = 256*64*sizeof(int);
 *   Context = polly_initContext();
 *   DevArray = polly_allocateMemoryForDevice(MemSize);
 *   Kernel = polly_getKernel(KernelString, KernelName);
 *
 *   void *Params[1];
 *   void *DevPtr = polly_getDevicePtr(DevArray)
 *   Params[0] = &DevPtr;
 *
 *   polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params);
 *
 *   polly_copyFromDeviceToHost(HostData, DevData, MemSize);
 *   polly_freeKernel(Kernel);
 *   polly_freeDeviceMemory(DevArray);
 *   polly_freeContext(Context);
 * }
 *
 */

typedef enum PollyGPURuntimeT {
  RUNTIME_NONE,
  RUNTIME_CUDA,
  RUNTIME_CL
} PollyGPURuntime;

typedef struct PollyGPUContextT PollyGPUContext;
typedef struct PollyGPUFunctionT PollyGPUFunction;
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;

typedef struct OpenCLContextT OpenCLContext;
typedef struct OpenCLKernelT OpenCLKernel;
typedef struct OpenCLDevicePtrT OpenCLDevicePtr;

typedef struct CUDAContextT CUDAContext;
typedef struct CUDAKernelT CUDAKernel;
typedef struct CUDADevicePtrT CUDADevicePtr;

PollyGPUContext *polly_initContextCUDA();
PollyGPUContext *polly_initContextCL();
PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
                                  const char *KernelName);
void polly_freeKernel(PollyGPUFunction *Kernel);
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
                                long MemSize);
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
                                long MemSize);
void polly_synchronizeDevice();
void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                        unsigned int GridDimY, unsigned int BlockSizeX,
                        unsigned int BlockSizeY, unsigned int BlockSizeZ,
                        void **Parameters);
void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation);
void polly_freeContext(PollyGPUContext *Context);

// Note that polly_{malloc/free}Managed are currently not used by Polly.
// We use them in COSMO by replacing all malloc with polly_mallocManaged and all
// frees with cudaFree, so we can get managed memory "automatically".
// Needless to say, this is a hack.
// Please make sure that this code is not present in Polly when 2018 rolls in.
// If this is still present, ping Siddharth Bhat <siddu.druid@gmail.com>
void *polly_mallocManaged(size_t size);
void polly_freeManaged(void *mem);
#endif /* GPUJIT_H_ */