diff options
author | Michael Kruse <llvm-project@meinersbur.de> | 2023-01-25 14:03:57 -0600 |
---|---|---|
committer | Michael Kruse <llvm-project@meinersbur.de> | 2023-03-08 17:33:04 -0600 |
commit | 19afbfe33156d211fa959dadeea46cd17b9c723c (patch) | |
tree | db53498143b16127c6c0e22a671a8d11eece4152 /polly/test | |
parent | 115c7beda74f3cfaf83b91d14bc97a39bff4cf19 (diff) | |
download | llvm-19afbfe33156d211fa959dadeea46cd17b9c723c.tar.gz |
[Polly] Remove Polly-ACC.
Polly-ACC is unmaintained and was never ported to the new pass manager (NPM) pipeline; since D136621 it is no longer even accessible without manually specifying its passes on the `opt` command line.
Since there is no plan to bring it into a maintainable state, remove it from Polly.
Reviewed By: grosser
Differential Revision: https://reviews.llvm.org/D142580
Diffstat (limited to 'polly/test')
76 files changed, 0 insertions, 6002 deletions
diff --git a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll b/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll deleted file mode 100644 index 3f4c4a0aa610..000000000000 --- a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll +++ /dev/null @@ -1,9 +0,0 @@ -define float @__nv_expf(float %a) { - ret float %a -} -define float @__nv_cosf(float %a) { - ret float %a -} -define float @__nv_logf(float %a) { - ret float %a -} diff --git a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll b/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll deleted file mode 100644 index 64b4cc4aa100..000000000000 --- a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Check that we detect a scop. -; SCOP: Function: checkScalarKill -; SCOP-NEXT: Region: %XLoopInit---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; Check that we have a scalar that is not a phi node in the scop. -; SCOP: i32 MemRef_x_0; // Element size 4 - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; Check that we add variables that are local to a scop into the kills that we -; pass to PPCG. This should enable PPCG to codegen this example. 
-; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) { -; int x; -; #pragma scop -; for(int i = 0; i < 1000; i++) { -; XLoopInit: x = 0; -; -; if (control1 > 2) -; C1Add: x += 10; -; if (control2 > 3) -; C2Add: x += A[i]; -; -; BLoopAccumX: B[i] += x; -; } -; -; #pragma endscop -; } -; ModuleID = 'test.ll' -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @checkScalarKill(ptr %A, ptr %B, ptr %C, i32 %control1, i32 %control2) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %XLoopInit - -XLoopInit: ; preds = %entry.split, %BLoopAccumX - %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ] - %cmp1 = icmp sgt i32 %control1, 2 - %x.0 = select i1 %cmp1, i32 10, i32 0 - %cmp2 = icmp sgt i32 %control2, 3 - br i1 %cmp2, label %C2Add, label %BLoopAccumX - -C2Add: ; preds = %XLoopInit - %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv - %tmp6 = load i32, ptr %arrayidx, align 4 - %add4 = add nsw i32 %tmp6, %x.0 - br label %BLoopAccumX - -BLoopAccumX: ; preds = %XLoopInit, %C2Add - %x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ] - %arrayidx7 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv - %tmp11 = load i32, ptr %arrayidx7, align 4 - %add8 = add nsw i32 %tmp11, %x.1 - store i32 %add8, ptr %arrayidx7, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %XLoopInit, label %for.end - -for.end: ; preds = %BLoopAccumX - ret void -} diff --git a/polly/test/GPGPU/align-params-in-schedule.ll b/polly/test/GPGPU/align-params-in-schedule.ll deleted file mode 100644 index fa9a8f3eb4e5..000000000000 --- a/polly/test/GPGPU/align-params-in-schedule.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \ -; RUN: FileCheck %s - -; REQUIRES: 
pollyacc - -; CHECK: polly_launchKernel - -; Verify that this program compiles. At some point, this compilation crashed -; due to insufficient parameters being available. - -source_filename = "bugpoint-output-4d01492.bc" -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -%struct.barney = type { ptr, i64, i64, [2 x %struct.widget] } -%struct.widget = type { i64, i64, i64 } - -@global = external unnamed_addr global %struct.barney, align 32 - -; Function Attrs: nounwind uwtable -define void @wobble(ptr noalias %arg) #0 { -bb: - %tmp = load i32, ptr %arg, align 4 - br label %bb1 - -bb1: ; preds = %bb13, %bb - %tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ] - br label %bb3 - -bb3: ; preds = %bb3, %bb1 - %tmp4 = load ptr, ptr @global, align 32 - %tmp5 = sext i32 %tmp2 to i64 - %tmp6 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 3, i64 1, i32 0), align 8 - %tmp7 = mul i64 %tmp6, %tmp5 - %tmp8 = add i64 %tmp7, 0 - %tmp9 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 1), align 8 - %tmp10 = add i64 %tmp8, %tmp9 - %tmp11 = getelementptr i32, ptr %tmp4, i64 %tmp10 - store i32 undef, ptr %tmp11, align 4 - %tmp12 = icmp eq i32 0, 0 - br i1 %tmp12, label %bb13, label %bb3 - -bb13: ; preds = %bb3 - %tmp14 = icmp eq i32 %tmp2, %tmp - %tmp15 = add i32 %tmp2, 1 - br i1 %tmp14, label %bb16, label %bb1 - -bb16: ; preds = %bb13 - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll b/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll deleted file mode 100644 index 12b872d55192..000000000000 --- a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: opt %loadPolly -S -polly-codegen-ppcg \ -; RUN: -polly-use-llvm-names 
< %s -; ModuleID = 'test/GPGPU/zero-size-array.ll' - -; REQUIRES: pollyacc - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - - -; We used to divide the element size by 8 to arrive at the 'actual' size -; of an array element. This used to cause arrays that have an element size -; of less than 8 to collapse to size 0. This test makes sure that it does -; not happen anymore. - -; f(int *niters_ptr, int *arr[0]) { -; const int inters = *niters_ptr; -; for(int i = 0; i < niters; i++) { -; arr[0][i + 1] = 0 -; } -; } - -; Function Attrs: nounwind uwtable -define void @f(ptr noalias %niters.ptr, ptr noalias %arr) #0 { -entry: - %niters = load i32, ptr %niters.ptr, align 4 - br label %loop.body - -loop.body: ; preds = %loop.body, %entry - %indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ] - %indvar.sext = sext i32 %indvar to i64 - %arr.slot = getelementptr [0 x i32], ptr %arr, i64 0, i64 %indvar.sext - store i32 0, ptr %arr.slot, align 4 - %tmp8 = icmp eq i32 %indvar, %niters - %indvar.next = add i32 %indvar, 1 - br i1 %tmp8, label %loop.exit, label %loop.body - -loop.exit: ; preds = %loop.body - %tmp10 = icmp sgt i32 undef, 0 - br label %auxiliary.loop - -auxiliary.loop: ; preds = %"101", %loop.exit - %tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ] - br i1 undef, label %auxiliary.loop, label %exit - -exit: ; preds = %auxiliary.loop - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll b/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll deleted file mode 100644 index a60744289885..000000000000 --- a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: opt %loadPolly -S -polly-codegen-ppcg \ -; RUN: 
-polly-ignore-parameter-bounds \ -; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=HOST-IR -; -; REQUIRES: pollyacc - -; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain -; all the parameters present in the program. -; -; The construction of the `isl_multi_pw_aff` requires all the indivisual `pw_aff` -; to have the same parameter dimensions. To achieve this, we used to realign -; every `pw_aff` with `Scop::Context`. However, in conjunction with -; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context` -; does not contain all parameters. -; -; We check that Polly does the right thing in this case and sets up the parameter -; dimensions correctly. - - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) -; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll' - -; C pseudocode -; ------------ -; void f(int *arr, long niters, long stride) { -; for(int i = 0; i < niters; i++) { -; arr[i * stride] = 1; -; } -; } - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: nounwind uwtable -define void @f(ptr %arr, i64 %niters, i64 %stride) unnamed_addr #1 { -entry: - br label %loop - -loop: ; preds = %loop, %entry - %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ] - %idx = mul nuw nsw i64 %indvar, %stride - %slot = getelementptr i32, ptr %arr, i64 %idx - store i32 1, ptr %slot, align 4 - %indvar.next = add nuw nsw i64 %indvar, 1 - %check = icmp sgt i64 %indvar.next, %niters - br i1 %check, label %exit, label %loop - -exit: ; preds = %loop - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind uwtable } diff 
--git a/polly/test/GPGPU/cuda-annotations.ll b/polly/test/GPGPU/cuda-annotations.ll deleted file mode 100644 index cbb0296d48ef..000000000000 --- a/polly/test/GPGPU/cuda-annotations.ll +++ /dev/null @@ -1,37 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) #0 { - -; KERNEL: !nvvm.annotations = !{!0} - -; KERNEL: !0 = !{ptr @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, i64 %n) { -bb: - br label %bb1 - -bb1: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb8 - -bb2: ; preds = %bb1 - %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp4 = load i64, ptr %tmp3, align 8 - %tmp5 = add nsw i64 %tmp4, 100 - store i64 %tmp5, ptr %tmp3, align 8 - br label %bb6 - -bb6: ; preds = %bb2 - %tmp7 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb8: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll deleted file mode 100644 index 8ef7e336cfad..000000000000 --- a/polly/test/GPGPU/cuda-managed-memory-simple.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc - -; -; #include <cuda_runtime.h> -; -; static const int N = 45; -; -; void copy(int *R, int *A) { -; for (int i = 0; i < N; i++) { -; R[i] = A[i] * 10; -; } -; } -; -; int main() { -; int *A, *R; -; -; cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal); -; cudaMallocManaged((void **)(&R), sizeof(int) * 
N, cudaMemAttachGlobal); -; -; for (int i = 0; i < N; i++) { -; A[i] = i; -; R[i] = 0; -; } -; copy(R, A); -; -; return 0; -; } -; - -; CHECK-NOT: polly_copyFromHostToDevice -; CHECK-NOT: polly_copyFromDeviceToHost -; CHECK-NOT: polly_freeDeviceMemory -; CHECK-NOT: polly_allocateMemoryForDevice - -; CHECK: %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA() -; CHECK-NEXT: %[[REGCA:[0-9]+]] = bitcast i32* %A to i8* -; CHECK-NEXT: %[[REGCR:[0-9]+]] = bitcast i32* %R to i8* -; CHECK-NEXT: %[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 -; CHECK-NEXT: store i8* %[[REGCA]], i8** %polly_launch_0_param_0 -; CHECK-NEXT: %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8* -; CHECK-NEXT: store i8* %[[REGCP0]], i8** %[[REGGEP0]] -; CHECK-NEXT: %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 -; CHECK-NEXT: store i8* %[[REGCR]], i8** %polly_launch_0_param_1 -; CHECK-NEXT: %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8* -; CHECK-NEXT: store i8* %[[REGCP1]], i8** %[[REGGEP1]] -; CHECK-NEXT: %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([863 x i8], [863 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0)) -; CHECK-NEXT: call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) -; CHECK-NEXT: call void @polly_freeKernel(i8* %[[REGKERNEL]]) -; CHECK-NEXT: call void @polly_synchronizeDevice() -; CHECK-NEXT: call void @polly_freeContext(i8* %[[REGCTX]]) - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @copy(i32* %R, i32* %A) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] - %exitcond = icmp ne i64 %indvars.iv, 45 - br i1 %exitcond, label %for.body, label 
%for.end - -for.body: ; preds = %for.cond - %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - %tmp = load i32, i32* %arrayidx, align 4 - %mul = mul nsw i32 %tmp, 10 - %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv - store i32 %mul, i32* %arrayidx2, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - br label %for.cond - -for.end: ; preds = %for.cond - ret void -} - -define i32 @main() { -entry: - %A = alloca i32*, align 8 - %R = alloca i32*, align 8 - %tmp = bitcast i32** %A to i8** - %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2 - %tmp1 = bitcast i32** %R to i8** - %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2 - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] - %exitcond = icmp ne i64 %indvars.iv, 45 - br i1 %exitcond, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %tmp2 = load i32*, i32** %A, align 8 - %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv - %tmp3 = trunc i64 %indvars.iv to i32 - store i32 %tmp3, i32* %arrayidx, align 4 - %tmp4 = load i32*, i32** %R, align 8 - %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv - store i32 0, i32* %arrayidx3, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - br label %for.cond - -for.end: ; preds = %for.cond - %tmp5 = load i32*, i32** %R, align 8 - %tmp6 = load i32*, i32** %A, align 8 - call void @copy(i32* %tmp5, i32* %tmp6) - ret i32 0 -} - -declare i32 @cudaMallocManaged(i8**, i64, i32) #1 diff --git a/polly/test/GPGPU/debug-metadata-leak.ll b/polly/test/GPGPU/debug-metadata-leak.ll deleted file mode 100644 index c90926c318e8..000000000000 --- a/polly/test/GPGPU/debug-metadata-leak.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: opt %loadPolly %s 
-polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: | FileCheck --check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; KERNEL-IR: define ptx_kernel void @FUNC_vec_add_1_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arr, i32 %N) #0 { - -; The instruction marked <<<LeakyInst>>> is copied into the GPUModule, -; with changes only to the parameters to access data on the device instead of -; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the -; instruction is annotated with a DILocation, copying the instruction also copies -; the metadata into the GPUModule. This stops codegenerating the ptx_kernel by -; failing the verification of the Module in GPUNodeBuilder::finalize, due to the -; copied DICompileUnit not being listed in a llvm.dbg.cu which was neither copied -; nor created. -; -; https://reviews.llvm.org/D35630 removes this debug metadata before the -; instruction is copied to the GPUModule. -; -; vec_add_1.c: -; void vec_add_1(int N, int arr[N]) { -; int i=0; -; for( i=0 ; i<N ; i++) arr[i] += 1; -; } -; -source_filename = "vec_add_1.c" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @vec_add_1(i32 %N, ptr %arr) !dbg !7 { -entry: - call void @llvm.dbg.value(metadata i32 %N, i64 0, metadata !13, metadata !16), !dbg !17 - call void @llvm.dbg.value(metadata ptr %arr, i64 0, metadata !14, metadata !16), !dbg !18 - call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19 - %tmp = sext i32 %N to i64, !dbg !20 - br label %for.cond, !dbg !20 - -for.cond: ; preds = %for.inc, %entry - %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] - call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !15, metadata !16), !dbg !19 - %cmp = icmp slt i64 %indvars.iv, %tmp, !dbg !22 - br i1 %cmp, label %for.body, label %for.end, !dbg !24 - -for.body: ; preds = %for.cond - %arrayidx = getelementptr inbounds i32, 
ptr %arr, i64 %indvars.iv, !dbg !25 - %tmp1 = load i32, ptr %arrayidx, align 4, !dbg !26, !tbaa !27 - %add = add nsw i32 %tmp1, 1, !dbg !26 ; <<<LeakyInst>>> - store i32 %add, ptr %arrayidx, align 4, !dbg !26, !tbaa !27 - br label %for.inc, !dbg !25 - -for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31 - call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19 - br label %for.cond, !dbg !32, !llvm.loop !33 - -for.end: ; preds = %for.cond - ret void, !dbg !35 -} - -declare void @llvm.dbg.declare(metadata, metadata, metadata) - -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) - - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 5.0.0"} -!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) -!8 = !DISubroutineType(types: !9) -!9 = !{null, !10, !11} -!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64) -!12 = !{!13, !14, !15} -!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10) -!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11) -!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10) -!16 = !DIExpression() -!17 = !DILocation(line: 1, column: 20, scope: !7) -!18 = !DILocation(line: 1, column: 27, scope: !7) -!19 = !DILocation(line: 2, column: 7, scope: !7) 
-!20 = !DILocation(line: 3, column: 8, scope: !21) -!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) -!22 = !DILocation(line: 3, column: 15, scope: !23) -!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3) -!24 = !DILocation(line: 3, column: 3, scope: !21) -!25 = !DILocation(line: 3, column: 25, scope: !23) -!26 = !DILocation(line: 3, column: 32, scope: !23) -!27 = !{!28, !28, i64 0} -!28 = !{!"int", !29, i64 0} -!29 = !{!"omnipotent char", !30, i64 0} -!30 = !{!"Simple C/C++ TBAA"} -!31 = !DILocation(line: 3, column: 21, scope: !23) -!32 = !DILocation(line: 3, column: 3, scope: !23) -!33 = distinct !{!33, !24, !34} -!34 = !DILocation(line: 3, column: 35, scope: !21) -!35 = !DILocation(line: 4, column: 1, scope: !7) diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll deleted file mode 100644 index 4aeee035a407..000000000000 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ /dev/null @@ -1,254 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=SCHED %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-ASM - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today due to extensive output differences from when the test was written. 
- -; CHECK: Stmt_bb5 -; CHECK-NEXT: Domain := -; CHECK-NEXT: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }; -; CHECK-NEXT: Schedule := -; CHECK-NEXT: { Stmt_bb5[i0, i1] -> [i0, i1] }; -; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] }; -; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] }; - -; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }" -; SCHED-NEXT: child: -; SCHED-NEXT: context: "{ [] }" -; SCHED-NEXT: child: -; SCHED-NEXT: extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: sequence: -; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: set: -; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: guard: "{ [] }" -; SCHED-NEXT: - filter: "{ Stmt_bb5[i0, i1] }" -; SCHED-NEXT: child: -; SCHED-NEXT: guard: "{ [] }" -; SCHED-NEXT: child: -; SCHED-NEXT: mark: "kernel" -; SCHED-NEXT: child: -; SCHED-NEXT: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }" -; SCHED-NEXT: child: -; SCHED-NEXT: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }" -; SCHED-NEXT: child: -; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]" -; SCHED-NEXT: permutable: 1 -; SCHED-NEXT: coincident: [ 1, 1 ] -; SCHED-NEXT: child: -; SCHED-NEXT: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }" -; SCHED-NEXT: child: -; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]" -; SCHED-NEXT: 
permutable: 1 -; SCHED-NEXT: coincident: [ 1, 1 ] -; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: set: -; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }" -; SCHED-NEXT: child: -; SCHED-NEXT: guard: "{ [] }" - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(16, 32); -; CODE-NEXT: dim3 k0_dimGrid(32, 32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) -; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3); - -; IR: polly.split_new_and_old: -; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024) -; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1 -; IR-NEXT: %polly.overflow.state = or i1 false, %.obit -; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0 -; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024) -; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1 -; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1 -; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0 -; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3) -; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1 -; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4 -; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0 -; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6) -; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1 -; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7 -; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0 -; 
IR-NEXT: %4 = icmp sge i64 %.res9, 2621440 -; IR-NEXT: %5 = and i1 true, %4 -; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true -; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown -; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2 - -; IR: polly.start: -; IR-NEXT: br label %polly.acc.initialize - -; IR: polly.acc.initialize: -; IR-NEXT: [[GPUContext:%.*]] = call ptr @polly_initContext() -; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 4194304) -; IR-NEXT: call void @polly_copyFromHostToDevice(ptr %A, ptr %p_dev_array_MemRef_A, i64 4194304) -; IR-NEXT: [[DevPtr:%.*]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A) -; IR-NEXT: store ptr [[DevPtr]], ptr %polly_launch_0_param_0 -; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params -; IR-NEXT: call ptr @polly_getKernel -; IR-NEXT: call void @polly_launchKernel(ptr %11, i32 32, i32 32, i32 32, i32 16, i32 1, ptr %polly_launch_0_params_i8ptr) -; IR-NEXT: call void @polly_freeKernel -; IR-NEXT: call void @polly_copyFromDeviceToHost(ptr %p_dev_array_MemRef_A, ptr %A, i64 4194304) -; IR-NEXT: call void @polly_freeDeviceMemory(ptr %p_dev_array_MemRef_A) -; IR-NEXT: call void @polly_freeContext(ptr [[GPUContext]]) -; IR-NEXT: br label %polly.exiting - -; IR: polly.exiting: -; IR-NEXT: br label %polly.merge_new_and_old - -; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(ptr %MemRef_A) #0 { -; KERNEL-IR-NEXT: entry: -; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() -; KERNEL-IR-NEXT: %b1 = zext i32 %1 to i64 -; KERNEL-IR-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-IR-NEXT: %t0 = zext i32 %2 to i64 -; KERNEL-IR-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() -; KERNEL-IR-NEXT: %t1 = zext i32 %3 to i64 -; KERNEL-IR-NEXT: br label %polly.loop_preheader - -; 
KERNEL-IR-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5 -; KERNEL-IR-NEXT: ret void - -; KERNEL-IR-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader -; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ] -; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0 -; KERNEL-IR-NEXT: %6 = mul nsw i64 32, %b1 -; KERNEL-IR-NEXT: %7 = add nsw i64 %6, %t1 -; KERNEL-IR-NEXT: %8 = mul nsw i64 16, %polly.indvar -; KERNEL-IR-NEXT: %9 = add nsw i64 %7, %8 -; KERNEL-IR-NEXT: br label %polly.stmt.bb5 - -; KERNEL-IR-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header -; KERNEL-IR-NEXT: %10 = mul i64 %5, %9 -; KERNEL-IR-NEXT: %p_tmp6 = sitofp i64 %10 to float -; KERNEL-IR-NEXT: %11 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %12 = add nsw i64 %11, %t0 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024 -; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b1 -; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t1 -; KERNEL-IR-NEXT: %15 = mul nsw i64 16, %polly.indvar -; KERNEL-IR-NEXT: %16 = add nsw i64 %14, %15 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16 -; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A -; KERNEL-IR-NEXT: %tmp8_p_scalar_ = load float, ptr %polly.access.MemRef_A, align 4 -; KERNEL-IR-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6 -; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024 -; KERNEL-IR-NEXT: %19 = mul nsw i64 32, %b1 -; KERNEL-IR-NEXT: %20 = add nsw i64 %19, %t1 -; KERNEL-IR-NEXT: %21 = mul nsw i64 16, %polly.indvar -; KERNEL-IR-NEXT: %22 = add nsw i64 %20, %21 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22 -; KERNEL-IR-NEXT: %polly.access.MemRef_A4 = getelementptr float, ptr 
%MemRef_A, i64 %polly.access.add.MemRef_A3 -; KERNEL-IR-NEXT: store float %p_tmp9, ptr %polly.access.MemRef_A4, align 4 -; KERNEL-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 -; KERNEL-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 0 -; KERNEL-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit - -; KERNEL-IR-LABEL: polly.loop_preheader: ; preds = %entry -; KERNEL-IR-NEXT: br label %polly.loop_header - -; KERNEL-IR: attributes #0 = { "polly.skip.fn" } - -; KERNEL-ASM: .version 3.2 -; KERNEL-ASM-NEXT: .target sm_30 -; KERNEL-ASM-NEXT: .address_size 64 - -; KERNEL-ASM: // .globl kernel_0 - -; KERNEL-ASM: .visible .entry kernel_0( -; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0 -; KERNEL-ASM-NEXT: ) - -; void double_parallel_loop(float A[][1024]) { -; for (long i = 0; i < 1024; i++) -; for (long j = 0; j < 1024; j++) -; A[i][j] += i * j; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @double_parallel_loop(ptr %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb13, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] - %exitcond1 = icmp ne i64 %i.0, 1024 - br i1 %exitcond1, label %bb3, label %bb15 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb10, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %j.0, 1024 - br i1 %exitcond, label %bb5, label %bb12 - -bb5: ; preds = %bb4 - %tmp = mul nuw nsw i64 %i.0, %j.0 - %tmp6 = sitofp i64 %tmp to float - %tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0 - %tmp8 = load float, ptr %tmp7, align 4 - %tmp9 = fadd float %tmp8, %tmp6 - store float %tmp9, ptr %tmp7, align 4 - br label %bb10 - -bb10: ; preds = %bb5 - %tmp11 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12 - %tmp14 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb15: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/failing-invariant-load-handling.ll 
b/polly/test/GPGPU/failing-invariant-load-handling.ll deleted file mode 100644 index 70f88667bd60..000000000000 --- a/polly/test/GPGPU/failing-invariant-load-handling.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS -; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" - -%S = type { i32, i32, [12 x %L] } -%L = type { i32, i32, double, i32, i32, i32, i32, i32 } - -define void @test(ptr %cpi, i1 %b) { -; SCOPS-LABEL: Region: %if.then14---%exit -; SCOPS: Invariant Accesses: { -; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] }; -; SCOPS-NEXT: Execution Context: [l2, l1] -> { : } -; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] }; -; SCOPS-NEXT: Execution Context: [l2, l1] -> { : l2 > 0 } -; SCOPS-NEXT: } -; SCOPS: Arrays { -; SCOPS-NEXT: i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4 -; SCOPS-NEXT: } - -; Check that we gracefully handle failing invariant loads. -; This test case is taken from: -; test/Isl/CodeGen/invariant-load-dimension.ll - -; FIXME: Figure out how to actually generate code for this loop. 
-; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function - -entry: - %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1 - br i1 %b, label %if.then14, label %exit - -if.then14: - %l0 = load i32, ptr %cpi, align 8 - %cmp12.i = icmp sgt i32 %l0, 0 - br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit - -for.body.lr.ph.i: - %l1 = load i32, ptr %nt, align 4 - br label %for.body.i - -for.body.i: - %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ] - %mul.i163 = mul nsw i32 %phi, %l1 - %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0 - store i32 0, ptr %cv, align 8 - %inc = add nuw nsw i32 %phi, 1 - %l2 = load i32, ptr %cpi, align 8 - %cmp.i164 = icmp slt i32 %inc, %l2 - br i1 %cmp.i164, label %for.body.i, label %exit - -exit: - ret void -} diff --git a/polly/test/GPGPU/failing-invariant-load-hoisting.ll b/polly/test/GPGPU/failing-invariant-load-hoisting.ll deleted file mode 100644 index aa62921e1af5..000000000000 --- a/polly/test/GPGPU/failing-invariant-load-hoisting.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" - -%S = type { i32, i32, [12 x %L] } -%L = type { i32, i32, double, i32, i32, i32, i32, i32 } - -define void @test(ptr %cpi, i1 %b) { -; CODEGEN-LABEL: @test( -; CODEGEN: polly.preload.begin: -; CODEGEN-NEXT: br i1 false - -entry: - %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1 - br i1 %b, label %if.then14, label %exit - -if.then14: - %l0 = load i32, ptr %cpi, align 8 - %cmp12.i = icmp sgt i32 %l0, 0 - br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit - -for.body.lr.ph.i: - %l1 = load i32, ptr %nt, align 4 - br label %for.body.i - -for.body.i: - %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ] - %mul.i163 = mul nsw i32 %phi, %l1 - %cv = getelementptr inbounds 
%S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0 - store i32 0, ptr %cv, align 8 - %inc = add nuw nsw i32 %phi, 1 - %l2 = load i32, ptr %cpi, align 8 - %cmp.i164 = icmp slt i32 %inc, %l2 - br i1 %cmp.i164, label %for.body.i, label %exit - -exit: - ret void -} diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll deleted file mode 100644 index 5ba65d60819c..000000000000 --- a/polly/test/GPGPU/host-control-flow.ll +++ /dev/null @@ -1,176 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \ -; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \ -; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | FileCheck %s -check-prefix=IR -; void foo(float A[2][100]) { -; for (long t = 0; t < 100; t++) -; for (long i = 1; i < 99; i++) -; A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1]; -; } - -; REQUIRES: pollyacc - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1) -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(4); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, c0); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: } - -; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader -; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] -; ... 
-; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 -; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* -; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] -; IR: call i8* @polly_getKernel -; ... -; IR: call void @polly_freeKernel -; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 -; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99 -; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0) -; KERNEL-IR-LABEL: entry: -; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-IR-NEXT: %t0 = zext i32 %1 to i64 -; KERNEL-IR-NEXT: br label %polly.cond - -; KERNEL-IR-LABEL: polly.cond: ; preds = %entry -; KERNEL-IR-NEXT: %2 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %3 = add nsw i64 %2, %t0 -; KERNEL-IR-NEXT: %4 = icmp sle i64 %3, 97 -; KERNEL-IR-NEXT: br i1 %4, label %polly.then, label %polly.else - -; KERNEL-IR-LABEL: polly.merge: ; preds = %polly.else, %polly.stmt.for.body3 -; KERNEL-IR-NEXT: ret void - -; KERNEL-IR-LABEL: polly.then: ; preds = %polly.cond -; KERNEL-IR-NEXT: %5 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %6 = add nsw i64 %5, %t0 -; KERNEL-IR-NEXT: br label %polly.stmt.for.body3 - -; KERNEL-IR-LABEL: polly.stmt.for.body3: ; preds = %polly.then -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %pexp.pdiv_r = urem i64 %c0, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100 -; KERNEL-IR-NEXT: %7 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %8 = add nsw i64 %7, %t0 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8 -; 
KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A -; KERNEL-IR-NEXT: %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4 -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %pexp.pdiv_r2 = urem i64 %c0, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100 -; KERNEL-IR-NEXT: %9 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %10 = add nsw i64 %9, %t0 -; KERNEL-IR-NEXT: %11 = add nsw i64 %10, 1 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11 -; KERNEL-IR-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4 -; KERNEL-IR-NEXT: %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4 -; KERNEL-IR-NEXT: %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_ -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %pexp.pdiv_r7 = urem i64 %c0, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100 -; KERNEL-IR-NEXT: %12 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %13 = add nsw i64 %12, %t0 -; KERNEL-IR-NEXT: %14 = add nsw i64 %13, 2 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14 -; KERNEL-IR-NEXT: %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9 -; KERNEL-IR-NEXT: %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4 -; KERNEL-IR-NEXT: %p_add12 = fadd float %p_add, %tmp3_p_scalar_ -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %15 = add nsw i64 %c0, 1 -; KERNEL-IR-NEXT: 
%pexp.pdiv_r12 = urem i64 %15, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100 -; KERNEL-IR-NEXT: %16 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %17 = add nsw i64 %16, %t0 -; KERNEL-IR-NEXT: %18 = add nsw i64 %17, 1 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18 -; KERNEL-IR-NEXT: %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14 -; KERNEL-IR-NEXT: %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4 -; KERNEL-IR-NEXT: %p_add17 = fadd float %tmp4_p_scalar_, %p_add12 -; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-IR-NEXT: %19 = add nsw i64 %c0, 1 -; KERNEL-IR-NEXT: %pexp.pdiv_r17 = urem i64 %19, 2 -; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100 -; KERNEL-IR-NEXT: %20 = mul nsw i64 32, %b0 -; KERNEL-IR-NEXT: %21 = add nsw i64 %20, %t0 -; KERNEL-IR-NEXT: %22 = add nsw i64 %21, 1 -; KERNEL-IR-NEXT: %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22 -; KERNEL-IR-NEXT: %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19 -; KERNEL-IR-NEXT: store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4 -; KERNEL-IR-NEXT: br label %polly.merge - -; KERNEL-IR-LABEL: polly.else: ; preds = %polly.cond -; KERNEL-IR-NEXT: br label %polly.merge -; KERNEL-IR-NEXT: } - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo([100 x float]* %A) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc18, %entry - %t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ] - %exitcond1 = icmp ne i64 %t.0, 100 - br i1 %exitcond1, label %for.body, label %for.end20 - -for.body: ; preds = %for.cond - br label %for.cond1 - -for.cond1: ; preds = %for.inc, 
%for.body - %i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ] - %exitcond = icmp ne i64 %i.0, 99 - br i1 %exitcond, label %for.body3, label %for.end - -for.body3: ; preds = %for.cond1 - %sub = add nsw i64 %i.0, -1 - %rem = srem i64 %t.0, 2 - %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub - %tmp = load float, float* %arrayidx4, align 4 - %rem5 = srem i64 %t.0, 2 - %arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0 - %tmp2 = load float, float* %arrayidx7, align 4 - %add = fadd float %tmp, %tmp2 - %add8 = add nuw nsw i64 %i.0, 1 - %rem9 = srem i64 %t.0, 2 - %arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8 - %tmp3 = load float, float* %arrayidx11, align 4 - %add12 = fadd float %add, %tmp3 - %add13 = add nuw nsw i64 %t.0, 1 - %rem14 = srem i64 %add13, 2 - %arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0 - %tmp4 = load float, float* %arrayidx16, align 4 - %add17 = fadd float %tmp4, %add12 - store float %add17, float* %arrayidx16, align 4 - br label %for.inc - -for.inc: ; preds = %for.body3 - %inc = add nuw nsw i64 %i.0, 1 - br label %for.cond1 - -for.end: ; preds = %for.cond1 - br label %for.inc18 - -for.inc18: ; preds = %for.end - %inc19 = add nuw nsw i64 %t.0, 1 - br label %for.cond - -for.end20: ; preds = %for.cond - ret void -} diff --git a/polly/test/GPGPU/host-statement.ll b/polly/test/GPGPU/host-statement.ll deleted file mode 100644 index d7232b2fa538..000000000000 --- a/polly/test/GPGPU/host-statement.ll +++ /dev/null @@ -1,204 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-invariant-load-hoisting=false \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -polly-invariant-load-hoisting=false \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck 
-check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.lifetime.start(i64, ptr nocapture) #0 - -; This test case tests that we can correctly handle a ScopStmt that is -; scheduled on the host, instead of within a kernel. - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(16); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: if (p_0 <= 510 && p_1 <= 510) { -; CODE-NEXT: { -; CODE-NEXT: dim3 k1_dimBlock(32); -; CODE-NEXT: dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16); -; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k2_dimBlock(16, 32); -; CODE-NEXT: dim3 k2_dimGrid(16, p_1 <= -7650 ? 
256 : -p_1 + floord(31 * p_1 + 30, 32) + 16); -; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: } -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost)); -; CODE-NEXT: Stmt_for_cond33_preheader_last(); - -; CODE: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_for_body16(32 * b0 + t0); - -; CODE: # kernel1 -; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1) -; CODE-NEXT: for (int c1 = 0; c1 <= 15; c1 += 1) { -; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0) -; CODE-NEXT: Stmt_for_body35(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510) -; CODE-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1) -; CODE-NEXT: Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3); -; CODE-NEXT: sync0(); -; CODE-NEXT: } - -; CODE: # kernel2 -; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1) -; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510) -; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) -; CODE-NEXT: Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3); - -; KERNEL-IR: call void @llvm.nvvm.barrier0() - -; Function Attrs: nounwind uwtable -define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, ptr %A, ptr %R, ptr %Q) #1 { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry.split, %for.inc86 - %indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ] - %indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 
] - br label %for.inc - -for.inc: ; preds = %for.cond1.preheader, %for.inc - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ] - %nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ] - %arrayidx5 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24 - %tmp = load double, ptr %arrayidx5, align 8, !tbaa !1 - %arrayidx9 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24 - %tmp27 = load double, ptr %arrayidx9, align 8, !tbaa !1 - %mul = fmul double %tmp, %tmp27 - %add = fadd double %nrm.02, %mul - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 512 - br i1 %exitcond, label %for.inc, label %for.end - -for.end: ; preds = %for.inc - %add.lcssa = phi double [ %add, %for.inc ] - %call = tail call double @sqrt(double %add.lcssa) #2 - %arrayidx13 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24 - store double %call, ptr %arrayidx13, align 8, !tbaa !1 - br label %for.body16 - -for.cond33.preheader: ; preds = %for.body16 - %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1 - %cmp347 = icmp slt i64 %indvars.iv.next25, 512 - br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86 - -for.body35.lr.ph: ; preds = %for.cond33.preheader - br label %for.body35 - -for.body16: ; preds = %for.end, %for.body16 - %indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ] - %arrayidx20 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv10, i64 %indvars.iv24 - %tmp28 = load double, ptr %arrayidx20, align 8, !tbaa !1 - %arrayidx24 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24 - %tmp29 = load double, ptr %arrayidx24, align 8, !tbaa !1 - %div = fdiv double %tmp28, %tmp29 - %arrayidx28 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv10, i64 %indvars.iv24 - store double %div, ptr %arrayidx28, align 8, !tbaa 
!1 - %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1 - %exitcond12 = icmp ne i64 %indvars.iv.next11, 512 - br i1 %exitcond12, label %for.body16, label %for.cond33.preheader - -for.cond33.loopexit: ; preds = %for.body62 - %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next22 to i32 - %exitcond23 = icmp ne i32 %lftr.wideiv, 512 - br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge - -for.body35: ; preds = %for.body35.lr.ph, %for.cond33.loopexit - %indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ] - %arrayidx39 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 - store double 0.000000e+00, ptr %arrayidx39, align 8, !tbaa !1 - br label %for.body42 - -for.cond60.preheader: ; preds = %for.body42 - br label %for.body62 - -for.body42: ; preds = %for.body35, %for.body42 - %indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ] - %arrayidx46 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv13, i64 %indvars.iv24 - %tmp30 = load double, ptr %arrayidx46, align 8, !tbaa !1 - %arrayidx50 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv13, i64 %indvars.iv21 - %tmp31 = load double, ptr %arrayidx50, align 8, !tbaa !1 - %mul51 = fmul double %tmp30, %tmp31 - %arrayidx55 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 - %tmp32 = load double, ptr %arrayidx55, align 8, !tbaa !1 - %add56 = fadd double %tmp32, %mul51 - store double %add56, ptr %arrayidx55, align 8, !tbaa !1 - %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 - %exitcond15 = icmp ne i64 %indvars.iv.next14, 512 - br i1 %exitcond15, label %for.body42, label %for.cond60.preheader - -for.body62: ; preds = %for.cond60.preheader, %for.body62 - %indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ] - %arrayidx66 = getelementptr inbounds 
[512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21 - %tmp33 = load double, ptr %arrayidx66, align 8, !tbaa !1 - %arrayidx70 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv16, i64 %indvars.iv24 - %tmp34 = load double, ptr %arrayidx70, align 8, !tbaa !1 - %arrayidx74 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21 - %tmp35 = load double, ptr %arrayidx74, align 8, !tbaa !1 - %mul75 = fmul double %tmp34, %tmp35 - %sub = fsub double %tmp33, %mul75 - %arrayidx79 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21 - store double %sub, ptr %arrayidx79, align 8, !tbaa !1 - %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 - %exitcond18 = icmp ne i64 %indvars.iv.next17, 512 - br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit - -for.cond33.for.inc86_crit_edge: ; preds = %for.cond33.loopexit - br label %for.inc86 - -for.inc86: ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond26 = icmp ne i64 %indvars.iv.next25, 512 - br i1 %exitcond26, label %for.cond1.preheader, label %for.end88 - -for.end88: ; preds = %for.inc86 - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, ptr nocapture) #0 - -; Function Attrs: nounwind -declare double @sqrt(double) #2 - -attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind } - -!llvm.ident = !{!0} - -!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"} -!1 = !{!2, !2, i64 0} -!2 = !{!"double", !3, i64 0} -!3 = 
!{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/ignore-parameter-bounds.ll b/polly/test/GPGPU/ignore-parameter-bounds.ll deleted file mode 100644 index 1d0b5482941e..000000000000 --- a/polly/test/GPGPU/ignore-parameter-bounds.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; CODE: Code -; CODE: ==== -; CODE: No code generated - -source_filename = "bugpoint-output-83bcdeb.bc" -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -@__data_radiation_MOD_cobi = external global [168 x double], align 32 - -; Function Attrs: nounwind uwtable -define void @__radiation_rg_MOD_coe_so() #0 { -entry: - %polly.access.kspec.load = load i32, ptr undef, align 4 - %0 = or i1 undef, undef - br label %polly.preload.cond29 - -polly.preload.cond29: ; preds = %entry - br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30 - -polly.preload.merge30: ; preds = %polly.preload.exec31, %polly.preload.cond29 - %polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ] - ret void - -polly.preload.exec31: ; preds = %polly.preload.cond29 - %1 = sext i32 %polly.access.kspec.load to i64 - %2 = mul nsw i64 7, %1 - %3 = add nsw i64 0, %2 - %4 = add nsw i64 %3, 48 - %polly.access.__data_radiation_MOD_cobi = getelementptr double, ptr @__data_radiation_MOD_cobi, i64 %4 - %polly.access.__data_radiation_MOD_cobi.load = load double, ptr %polly.access.__data_radiation_MOD_cobi, align 8 - br label %polly.preload.merge30 -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll 
b/polly/test/GPGPU/intrinsic-copied-into-kernel.ll deleted file mode 100644 index 7c1e3672abb5..000000000000 --- a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR -; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR - -; Test that we do recognise and codegen a kernel that has intrinsics. - -; REQUIRES: pollyacc - -; Check that we model the kernel as a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end - -; Check that the intrinsic call is present in the kernel IR. -; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_) -; KERNEL-IR: declare float @llvm.sqrt.f32(float) -; KERNEL-IR: declare float @llvm.fabs.f32(float) - - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. 
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) - - -; void f(float *A, float *B, int N) { -; for(int i = 0; i < N; i++) { -; float tmp0 = A[i]; -; float tmp1 = sqrt(tmp1); -; float tmp2 = fabs(tmp2); -; float tmp3 = copysignf(tmp1, tmp2); -; B[i] = tmp4; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(float* %A, float* %B, i32 %N) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp1 = icmp sgt i32 %N, 0 - br i1 %cmp1, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv - %A.arr.i.val = load float, float* %A.arr.i, align 4 - ; Call to intrinsics that should be part of the kernel. - %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val) - %fabs = tail call float @llvm.fabs.f32(float %sqrt); - %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs); - %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv - store float %copysign, float* %B.arr.i, align 4 - - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %wide.trip.count = zext i32 %N to i64 - %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - -; Function Attrs: nounwind readnone -declare float @llvm.sqrt.f32(float) #0 -declare float @llvm.fabs.f32(float) #0 -declare float @llvm.copysign.f32(float, float) #0 - -attributes #0 = { nounwind readnone } - diff --git a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll b/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll deleted file mode 100644 index 
4b9139f0b44c..000000000000 --- a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \ -; RUN: -disable-output < %s - -; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually -; fail on an illegal module. - -; REQUIRES: pollyacc, asserts -; XFAIL: * -; -; void foo(long A[1024], long B[1024]) { -; for (long i = 0; i < 1024; i++) -; A[i] += (B[i] + (long)&B[i]); -; } - - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb1 - -bb1: ; preds = %bb10, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb12 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0 - %tmp3 = load i64, ptr %tmp, align 8 - %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0 - %tmp5 = ptrtoint ptr %tmp4 to i64 - %tmp6 = add nsw i64 %tmp3, %tmp5 - %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp8 = load i64, ptr %tmp7, align 8 - %tmp9 = add nsw i64 %tmp8, %tmp6 - store i64 %tmp9, ptr %tmp7, align 8 - br label %bb10 - -bb10: ; preds = %bb2 - %tmp11 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb12: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/invalid-kernel.ll b/polly/test/GPGPU/invalid-kernel.ll deleted file mode 100644 index 9dd32eac97c0..000000000000 --- a/polly/test/GPGPU/invalid-kernel.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: not FileCheck %s -check-prefix=KERNEL-IR - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; REQUIRES: 
pollyacc -; -; void foo(long A[1024], long B[1024]) { -; for (long i = 0; i < 1024; i++) -; A[i] += (B[i] + (long)&B[i]); -; } - -; This kernel loads/stores a pointer address we model. This is a rare case, -; were we still lack proper code-generation support. We check here that we -; detect the invalid IR and bail out gracefully. - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B, dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost)); - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; KERNEL-IR: kernel - -; IR: br i1 false, label %polly.start, label %bb1 - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb1 - -bb1: ; preds = %bb10, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb12 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0 - %tmp3 = load i64, ptr %tmp, align 8 - %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0 - %tmp5 = ptrtoint ptr %tmp4 to i64 - %tmp6 = add nsw i64 %tmp3, %tmp5 - %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp8 = load i64, ptr %tmp7, align 8 - %tmp9 = add nsw i64 %tmp8, %tmp6 - store i64 %tmp9, ptr %tmp7, align 8 - br label %bb10 - -bb10: ; preds = %bb2 - %tmp11 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb12: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/invariant-load-array-access.ll 
b/polly/test/GPGPU/invariant-load-array-access.ll deleted file mode 100644 index 02c0330a7e7e..000000000000 --- a/polly/test/GPGPU/invariant-load-array-access.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - - -; REQUIRES: pollyacc - -; Check that we detect a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] }; -; SCOP-NEXT: Execution Context: [tmp] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] }; -; SCOP-NEXT: Execution Context: [tmp] -> { : tmp >= 4 } -; SCOP-NEXT: } - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; This test makes sure that such an access pattern is handled correctly -; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads` -; was the main reason that caused this test case to crash. 
-; -; void f(int *arr, const int *control, const int *readarr) { -; for(int i = 0; i < 1000; i++) { -; int t = 0; -; if (*control > 3) { -; t += *readarr; -; } -; arr[i] = t; -; } -; } - - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" -target triple = "i386-apple-macosx10.12.0" -define void @f(ptr %arr, ptr %control, ptr %readarr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %if.end - %i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ] - %tmp = load i32, ptr %control, align 4 - %cmp1 = icmp sgt i32 %tmp, 3 - br i1 %cmp1, label %if.then, label %if.end - -if.then: ; preds = %for.body - %tmp1 = load i32, ptr %readarr, align 4 - br label %if.end - -if.end: ; preds = %if.then, %for.body - %t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.01 - store i32 %t.0, ptr %arrayidx, align 4 - %inc = add nuw nsw i32 %i.01, 1 - %exitcond = icmp eq i32 %inc, 1000 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %if.end - ret void -} diff --git a/polly/test/GPGPU/invariant-load-escaping-values.ll b/polly/test/GPGPU/invariant-load-escaping-values.ll deleted file mode 100644 index 54f4b43fdb92..000000000000 --- a/polly/test/GPGPU/invariant-load-escaping-values.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: opt %loadPolly -S -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s - -; REQUIRES: pollyacc - -; CHECK: store i64 %polly.access.B.load, ptr %invariant.preload.s2a -; CHECK: %invariant.final_reload = load i64, ptr %invariant.preload.s2a - -; Verify that the final reload of an invariant scalar memory access uses the -; same stack slot that into which the invariant memory access was stored -; originally. 
Earlier, this was broken as we introduce a new stack slot aside -; of the preload stack slot, which remained uninitialized and caused our escaping -; loads to contain garbage. - -define i64 @foo(ptr %A, ptr %B) { -entry: - br label %loop - -loop: - %indvar = phi i64 [0, %entry], [%indvar.next, %loop] - %indvar.next = add nsw i64 %indvar, 1 - %idx = getelementptr float, ptr %A, i64 %indvar - store float 42.0, ptr %idx - %invariant = load i64, ptr %B - %cmp = icmp sle i64 %indvar, 1024 - br i1 %cmp, label %loop, label %exit - -exit: - ret i64 %invariant -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-of-array.ll b/polly/test/GPGPU/invariant-load-hoisting-of-array.ll deleted file mode 100644 index 015a3dacbe10..000000000000 --- a/polly/test/GPGPU/invariant-load-hoisting-of-array.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - -; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Entry: Contains (%loaded.ptr.preload.s2a = alloca double*) which is -; | invariant load hoisted `%loaded.ptr` -; v -; Run-time check --(failure branch)--> { old code - contains `%loaded.ptr` } -; | -; (success branch) -; | -; v -; New Code: Should refer to `%loaded.ptr.preload.s2a`, which is -; the invariant load hoisted value, NOT `%loaded.ptr`. - -; In Polly, we preserve the old code and create a separate branch that executes -; the GPU code if a run-time check succeeds. - -; We need to make sure that in the new branch, we pick up invariant load hoisted -; values. The old values will belong to the old code branch. - -; In this case, we use to try to load the 'original' %loaded.ptr in the -; 'New Code' branch,which is wrong. Check that this does not happen. - -; Check that we have a Scop with an invariant load of the array. 
-; SCOP: Function: f -; SCOP-NEXT: Region: %arrload---%for.exit -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: { Stmt_arrload[] -> MemRef_arr_of_ptrs[0] }; - - - -; Check that we have the preloaded array. -; HOST-IR: entry: -; HOST-IR-NEXT: %loaded.ptr.preload.s2a = alloca double* - -; Chek that we store the correct value in the preload. -; polly.preload.begin: ; preds = %polly.split_new_and_old -; HOST-IR: %polly.access.arr.of.ptrs = getelementptr double*, double** %arr.of.ptrs, i64 0 -; HOST-IR-NEXT: %polly.access.arr.of.ptrs.load = load double*, double** %polly.access.arr.of.ptrs -; HOST-IR-NEXT: store double* %polly.access.arr.of.ptrs.load, double** %loaded.ptr.preload.s2a - -; Check that we get back data from the kernel. -; HOST-IR: polly.acc.initialize: ; preds = %polly.start -; HOST-IR: [[FIRSTINDEX:%.+]] = getelementptr double, double* %polly.access.arr.of.ptrs.load, i64 1 -; HOST-IR: [[BITCASTED:%.+]] = bitcast double* [[FIRSTINDEX]] to i8* -; HOST-IR: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_loaded_ptr, i8* [[BITCASTED]], i64 800) - -; Check that the kernel launch is generated in the host IR. -; This declaration would not have been generated unless a kernel launch exists. 
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) - - -; C pseudocode equivalent -; void f(double **arr_of_ptrs) { -; double *loaded_ptr = arr_of_ptrs[0]; -; if (false) { return; } -; else { -; for(int i = 1; i < 100; i++) { -; loaded_ptr[i] = 42.0; -; } -; } -; } - - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - - -; Function Attrs: nounwind uwtable -define void @f(double **%arr.of.ptrs) #0 { -entry: - br label %arrload - -arrload: ; preds = %"7" - %loaded.ptr = load double*, double** %arr.of.ptrs, align 8 - br i1 false, label %"for.exit", label %"for.preheader" - -"for.preheader": ; preds = %"51" - br label %"for.body" - -"for.body": ; preds = %"53", %"53.lr.ph" - %indvar = phi i64 [ 1, %"for.preheader" ], [ %indvar.next, %"for.body" ] - %slot = getelementptr double, double* %loaded.ptr, i64 %indvar - store double 42.0, double* %slot, align 8 - - %indvar.next = add nuw nsw i64 %indvar, 1 - - %check = icmp sgt i64 %indvar.next, 100 - br i1 %check, label %"for.exit", label %"for.body" - -"for.exit": ; preds = %"52.54_crit_edge", %"51" - ret void -} - -attributes #0 = { nounwind uwtable } diff --git a/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll b/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll deleted file mode 100644 index ad30ef6f9b24..000000000000 --- a/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=HOST-IR %s - -; RUN: opt %loadPolly -disable-output -polly-acc-dump-kernel-ir \ -; RUN: -polly-codegen-ppcg -polly-scops \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; Verify that invariant 
loads used in a kernel statement are correctly forwarded -; as subtree value to the GPU kernel. - -; HOST-IR: store float %polly.access.p.load, ptr %invariant.preload.s2a, align 4 - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2({{.*}}ptr addrspace(1) %MemRef_indvar2f__phi{{.*}}) -; KERNEL-IR: %indvar2f.phiops.reload = load float, ptr %indvar2f.phiops, align 4 -; KERNEL-IR: store float %indvar2f.phiops.reload, ptr addrspace(1) %polly.access.MemRef_A, align 4 - -; FIXME: store float %indvar2f.phiops.reload, ptr %indvar2f.phiops, align 4 -; For some reason the above instruction is emitted that stores back to the addess it was just loaded from. - -define void @foo(ptr %A, ptr %p) { -entry: - br label %loop - -loop: - %indvar = phi i64 [0, %entry], [%indvar.next, %loop] - %indvar.next = add i64 %indvar, 1 - %invariant = load float, ptr %p - %ptr = getelementptr float, ptr %A, i64 %indvar - store float 42.0, ptr %ptr - %cmp = icmp sle i64 %indvar, 1024 - br i1 %cmp, label %loop, label %anotherloop - -anotherloop: - %indvar2 = phi i64 [0, %loop], [%indvar2.next, %anotherloop] - %indvar2f = phi float [%invariant, %loop], [%indvar2f, %anotherloop] - %indvar2.next = add i64 %indvar2, 1 - store float %indvar2f, ptr %A - %cmp2 = icmp sle i64 %indvar2, 1024 - br i1 %cmp2, label %anotherloop, label %end - -end: - ret void - -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll deleted file mode 100644 index 7a650eeb22ee..000000000000 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; 
SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_entry_split[] -> MemRef_begin[0] }; -; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp1, tmp4] -> { Stmt_for_body[i0] -> MemRef_end[0] }; -; SCOP-NEXT: Execution Context: [tmp1, tmp4] -> { : } -; SCOP-NEXT: } - - -; Check that the kernel launch is generated in the host IR. -; This declaration would not have been generated unless a kernel launch exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; void f(int *begin, int *end, int *arr) { -; for (int i = *begin; i < *end; i++) { -; arr[i] = 0; -; } -; } -; - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" - -define void @f(ptr %begin, ptr %end, ptr %arr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp1 = load i32, ptr %begin, align 4 - %tmp41 = load i32, ptr %end, align 4 - %cmp2 = icmp slt i32 %tmp1, %tmp41 - br i1 %cmp2, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %i.03 = phi i32 [ %tmp1, %for.body.lr.ph ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.03 - store i32 0, ptr %arrayidx, align 4 - %inc = add nsw i32 %i.03, 1 - %tmp4 = load i32, ptr %end, align 4 - %cmp = icmp slt i32 %inc, %tmp4 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll deleted file mode 100644 index 
a637cc44c7a3..000000000000 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Check that we detect a scop with invariant accesses. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [beginval] -> { Stmt_entry_split[] -> MemRef_begin[0] }; -; SCOP-NEXT: Execution Context: [beginval] -> { : } -; SCOP-NEXT: } - -; Check that the kernel launch is generated in the host IR. -; This declaration would not have been generated unless a kernel launch exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; -; void f(int *begin, int *arr) { -; for (int i = *begin; i < 100; i++) { -; arr[i] = 0; -; } -; } - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" - -define void @f(ptr %begin, ptr %arr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %beginval = load i32, ptr %begin, align 4 - %cmp1 = icmp slt i32 %beginval, 100 - br i1 %cmp1, label %for.body, label %for.end - - - -for.body: ; preds = %for.body.lr.ph, %for.body - %ival = phi i32 [ %beginval, %entry.split ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %ival - store i32 0, ptr %arrayidx, align 4 - %inc = add nsw i32 %ival, 1 - %cmp = icmp slt i32 %ival, 99 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} diff --git 
a/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll deleted file mode 100644 index 3c19a306734a..000000000000 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; Check that we detect a scop with invariant accesses. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp2] -> { Stmt_for_body[i0] -> MemRef_idx[0] }; -; SCOP-NEXT: Execution Context: [tmp2] -> { : } -; SCOP-NEXT: } - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; Check if we generate GPU code for simple loop with variable upper bound. -; This always worked, but have this test to prevent regressions. 
-; void f(int *idx, int *arr) { -; for (int i = 0; i < *idx; i++) { -; arr[i] = 0; -; } -; } -; -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %idx, ptr %arr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp21 = load i32, ptr %idx, align 4 - %cmp2 = icmp sgt i32 %tmp21, 0 - br i1 %cmp2, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv - store i32 0, ptr %arrayidx, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %tmp2 = load i32, ptr %idx, align 4 - %0 = sext i32 %tmp2 to i64 - %cmp = icmp slt i64 %indvars.iv.next, %0 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-hoisting.ll b/polly/test/GPGPU/invariant-load-hoisting.ll deleted file mode 100644 index 5ae1cfae255d..000000000000 --- a/polly/test/GPGPU/invariant-load-hoisting.ll +++ /dev/null @@ -1,116 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; -; RUN: opt %loadPolly -polly-scops -S -polly-invariant-load-hoisting \ -; RUN: -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR -; -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=KERNEL-IR -; -; REQUIRES: pollyacc -; -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end26 -; SCOP-NEXT: Max Loop Depth: 3 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; 
SCOP-NEXT: [n, tmp12] -> { Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] }; -; SCOP-NEXT: Execution Context: [n, tmp12] -> { : n > 0 } -; SCOP-NEXT: } -; HOST-IR: call void @polly_launchKernel(ptr %[[REGC:[0-9]+]], i32 %{{[0-9]+}}, i32 1, i32 32, i32 1, i32 1, ptr %polly_launch_0_params_i8ptr) -; HOST-IR-NEXT: call void @polly_freeKernel(ptr %[[REGC]]) - -; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_B, ptr addrspace(1) %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge) - - -; Check that we generate correct GPU code in case of invariant load hoisting. -; -; -; static const int N = 3000; -; -; void f(int A[N][N], int *invariant, int B[N][N], int n) { -; for (int i = 0; i < n; i++) { -; for (int j = 0; j < n; j++) { -; for (int k = 0; k < n; k++) { -; -; A[*invariant][k] = B[k][k]; -; A[k][*invariant] += B[k][k]; -; } -; } -; } -; } -; - -define void @f(ptr %A, ptr %invariant, ptr %B, i32 %n) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp6 = icmp sgt i32 %n, 0 - br i1 %cmp6, label %for.cond1.preheader.lr.ph, label %for.end26 - -for.cond1.preheader.lr.ph: ; preds = %entry.split - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc24 - %i.07 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc25, %for.inc24 ] - %cmp23 = icmp sgt i32 %n, 0 - br i1 %cmp23, label %for.cond4.preheader.lr.ph, label %for.inc24 - -for.cond4.preheader.lr.ph: ; preds = %for.cond1.preheader - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %for.cond4.preheader.lr.ph, %for.inc21 - %j.04 = phi i32 [ 0, %for.cond4.preheader.lr.ph ], [ %inc22, %for.inc21 ] - %cmp51 = icmp sgt i32 %n, 0 - br i1 %cmp51, label %for.body6.lr.ph, label %for.inc21 - -for.body6.lr.ph: ; preds = %for.cond4.preheader - br label %for.body6 - -for.body6: ; preds = %for.body6.lr.ph, %for.body6 - %k.02 = phi i32 [ 0, %for.body6.lr.ph ], [ %inc, %for.body6 ] - %idxprom = sext i32 %k.02 
to i64 - %idxprom7 = sext i32 %k.02 to i64 - %arrayidx8 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom, i64 %idxprom7 - %tmp9 = load i32, ptr %arrayidx8, align 4 - %tmp12 = load i32, ptr %invariant, align 4 - %idxprom9 = sext i32 %tmp12 to i64 - %idxprom11 = sext i32 %k.02 to i64 - %arrayidx12 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom9, i64 %idxprom11 - store i32 %tmp9, ptr %arrayidx12, align 4 - %idxprom13 = sext i32 %k.02 to i64 - %idxprom15 = sext i32 %k.02 to i64 - %arrayidx16 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom13, i64 %idxprom15 - %tmp17 = load i32, ptr %arrayidx16, align 4 - %idxprom17 = sext i32 %k.02 to i64 - %tmp21 = load i32, ptr %invariant, align 4 - %idxprom19 = sext i32 %tmp21 to i64 - %arrayidx20 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom17, i64 %idxprom19 - %tmp22 = load i32, ptr %arrayidx20, align 4 - %add = add nsw i32 %tmp22, %tmp17 - store i32 %add, ptr %arrayidx20, align 4 - %inc = add nuw nsw i32 %k.02, 1 - %cmp5 = icmp slt i32 %inc, %n - br i1 %cmp5, label %for.body6, label %for.cond4.for.inc21_crit_edge - -for.cond4.for.inc21_crit_edge: ; preds = %for.body6 - br label %for.inc21 - -for.inc21: ; preds = %for.cond4.for.inc21_crit_edge, %for.cond4.preheader - %inc22 = add nuw nsw i32 %j.04, 1 - %cmp2 = icmp slt i32 %inc22, %n - br i1 %cmp2, label %for.cond4.preheader, label %for.cond1.for.inc24_crit_edge - -for.cond1.for.inc24_crit_edge: ; preds = %for.inc21 - br label %for.inc24 - -for.inc24: ; preds = %for.cond1.for.inc24_crit_edge, %for.cond1.preheader - %inc25 = add nuw nsw i32 %i.07, 1 - %cmp = icmp slt i32 %inc25, %n - br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end26_crit_edge - -for.cond.for.end26_crit_edge: ; preds = %for.inc24 - br label %for.end26 - -for.end26: ; preds = %for.cond.for.end26_crit_edge, %entry.split - ret void -} diff --git a/polly/test/GPGPU/invariant-load-of-scalar.ll b/polly/test/GPGPU/invariant-load-of-scalar.ll deleted file mode 
100644 index fbc1d4d7ecee..000000000000 --- a/polly/test/GPGPU/invariant-load-of-scalar.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck -check-prefix=SCOP %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=HOST-IR %s - - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; Check that we offload invariant loads of scalars correctly. - -; Check that invariant loads are present. -; SCOP: Function: checkPrivatization -; SCOP-NEXT: Region: %entry.split---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP-NEXT: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp2] -> { Stmt_entry_split[] -> MemRef_begin[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp2] -> { : } -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: [tmp, tmp2] -> { Stmt_for_body[i0] -> MemRef_end[0] }; -; SCOP-NEXT: Execution Context: [tmp, tmp2] -> { : } -; SCOP-NEXT: } -; - -; Check that we do not actually allocate arrays for %begin, %end, since they are -; invariant load hoisted. -; HOST-IR: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice -; HOST-IR-NOT: call ptr @polly_allocateMemoryForDevice - -; Check that we send the invariant loaded scalars as parameters to the -; kernel function. 
-; KERNEL-IR: define ptx_kernel void @FUNC_checkPrivatization_SCOP_0_KERNEL_0 -; KERNEL-IR-SAME: (ptr addrspace(1) %MemRef_A, i32 %tmp, -; KERNEL-IR-SAME: i32 %tmp2, i32 %polly.access.begin.load) - - -; void checkScalarPointerOffload(int A[], int *begin, int *end) { -; for(int i = *begin; i < *end; i++) { -; A[i] = 10; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -define void @checkPrivatization(ptr %A, ptr %begin, ptr %end) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp = load i32, ptr %begin, align 4 - %tmp21 = load i32, ptr %end, align 4 - %cmp3 = icmp slt i32 %tmp, %tmp21 - br i1 %cmp3, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - %tmp1 = sext i32 %tmp to i64 - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv4 = phi i64 [ %tmp1, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv4 - store i32 10, ptr %arrayidx, align 4 - %indvars.iv.next = add i64 %indvars.iv4, 1 - %tmp2 = load i32, ptr %end, align 4 - %tmp3 = sext i32 %tmp2 to i64 - %cmp = icmp slt i64 %indvars.iv.next, %tmp3 - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll deleted file mode 100644 index 87ae470e29bc..000000000000 --- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll +++ /dev/null @@ -1,106 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | \ -; RUN: FileCheck 
-check-prefix=IR %s - -; REQUIRES: pollyacc -; -; void kernel_params_only_some_arrays(float A[], float B[]) { -; for (long i = 0; i < 32; i++) -; A[i] += 42; -; -; for (long i = 0; i < 32; i++) -; B[i] += 42; -; } - -; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0' -; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0" -; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" - -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B) -; KERNEL-NEXT: entry: -; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-NEXT: %t0 = zext i32 %1 to i64 - -; KERNEL: ret void -; KERNEL-NEXT: } - -; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1' -; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1" -; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda" - -; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A) -; KERNEL-NEXT: entry: -; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() -; KERNEL-NEXT: %b0 = zext i32 %0 to i64 -; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -; KERNEL-NEXT: %t0 = zext i32 %1 to i64 - -; KERNEL: ret void -; KERNEL-NEXT: } - - -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 -; IR-NEXT: 
store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 -; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* -; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] - -; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 -; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 -; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* -; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @kernel_params_only_some_arrays(float* %A, float* %B) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %arrayidx = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp = load float, float* %arrayidx, align 4 - %add = fadd float %tmp, 4.200000e+01 - store float %add, float* %arrayidx, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %inc = add nuw nsw i64 %i.0, 1 - br label %for.cond - -for.end: ; preds = %for.cond - br label %for.cond2 - -for.cond2: ; preds = %for.inc7, %for.end - %i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ] - %exitcond = icmp ne i64 %i1.0, 32 - br i1 %exitcond, label %for.body4, label %for.end9 - -for.body4: ; preds = %for.cond2 - %arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0 - %tmp2 = load float, float* %arrayidx5, align 4 - %add6 = fadd float %tmp2, 4.200000e+01 - store float %add6, float* %arrayidx5, align 4 - br label %for.inc7 - -for.inc7: ; preds = %for.body4 - %inc8 = add nuw nsw i64 %i1.0, 1 - br label %for.cond2 - -for.end9: ; preds = %for.cond2 - ret void -} diff --git a/polly/test/GPGPU/kernel-params-scop-parameter.ll b/polly/test/GPGPU/kernel-params-scop-parameter.ll deleted file mode 100644 index 
527492bfd5fb..000000000000 --- a/polly/test/GPGPU/kernel-params-scop-parameter.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL-IR %s - -; REQUIRES: pollyacc - -; void kernel_params_scop_parameter(float A[], long n) { -; for (long i = 0; i < n; i++) -; A[i] += 42; -; } - -; KERNEL-IR: define ptx_kernel void @FUNC_kernel_params_scop_parameter_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @kernel_params_scop_parameter(ptr %A, i64 %n) { -bb: - br label %bb1 - -bb1: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb8 - -bb2: ; preds = %bb1 - %tmp3 = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp4 = load float, ptr %tmp3, align 4 - %tmp5 = fadd float %tmp4, 4.200000e+01 - store float %tmp5, ptr %tmp3, align 4 - br label %bb6 - -bb6: ; preds = %bb2 - %tmp7 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb8: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll b/polly/test/GPGPU/kernels-names-across-scops-funcs.ll deleted file mode 100644 index 57fe70ec0d9b..000000000000 --- a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll +++ /dev/null @@ -1,124 +0,0 @@ -; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-kernel-ir -disable-output < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { -; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_1_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { -; KERNEL: define ptx_kernel void @FUNC_foo2_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 { - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; 
Function Attrs: nounwind uwtable -define void @foo(i32 %arg, ptr %arg1) #0 { -bb: - br label %bb2 - -bb2: ; preds = %bb - %tmp = icmp sgt i32 %arg, 0 - br i1 %tmp, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb4, %bb3 - %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ] - %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp5 - %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2 - %tmp8 = add nsw i32 %tmp7, 1 - store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2 - %tmp9 = add nuw nsw i64 %tmp5, 1 - %tmp10 = zext i32 %arg to i64 - %tmp11 = icmp ne i64 %tmp9, %tmp10 - br i1 %tmp11, label %bb4, label %bb12 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12, %bb2 - %tmp14 = tail call i64 @clock() #3 - %tmp15 = icmp eq i64 %tmp14, 0 - br i1 %tmp15, label %bb16, label %bb29 - -bb16: ; preds = %bb13 - %tmp17 = icmp sgt i32 %arg, 0 - br i1 %tmp17, label %bb18, label %bb28 - -bb18: ; preds = %bb16 - br label %bb19 - -bb19: ; preds = %bb19, %bb18 - %tmp20 = phi i64 [ 0, %bb18 ], [ %tmp24, %bb19 ] - %tmp21 = getelementptr inbounds i32, ptr %arg1, i64 %tmp20 - %tmp22 = load i32, ptr %tmp21, align 4, !tbaa !2 - %tmp23 = add nsw i32 %tmp22, 1 - store i32 %tmp23, ptr %tmp21, align 4, !tbaa !2 - %tmp24 = add nuw nsw i64 %tmp20, 1 - %tmp25 = zext i32 %arg to i64 - %tmp26 = icmp ne i64 %tmp24, %tmp25 - br i1 %tmp26, label %bb19, label %bb27 - -bb27: ; preds = %bb19 - br label %bb28 - -bb28: ; preds = %bb27, %bb16 - br label %bb29 - -bb29: ; preds = %bb28, %bb13 - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1 - -; Function Attrs: nounwind -declare i64 @clock() #2 - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1 - -; Function Attrs: nounwind uwtable -define void @foo2(i32 %arg, ptr %arg1) #0 { -bb: - br label %bb2 - -bb2: ; preds = %bb - %tmp = icmp sgt i32 %arg, 0 - br i1 %tmp, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br 
label %bb4 - -bb4: ; preds = %bb4, %bb3 - %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ] - %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp5 - %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2 - %tmp8 = add nsw i32 %tmp7, 1 - store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2 - %tmp9 = add nuw nsw i64 %tmp5, 1 - %tmp10 = zext i32 %arg to i64 - %tmp11 = icmp ne i64 %tmp9, %tmp10 - br i1 %tmp11, label %bb4, label %bb12 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12, %bb2 - ret void -} - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind } - -!llvm.module.flags = !{!0} -!llvm.ident = !{!1} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{!"clang version 5.0.0"} -!2 = !{!3, !3, i64 0} -!3 = !{!"int", !4, i64 0} -!4 = !{!"omnipotent char", !5, i64 0} -!5 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll b/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll deleted file mode 100644 index 0f8405dad7e8..000000000000 --- a/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll +++ /dev/null @@ -1,89 +0,0 @@ -; RUN: opt 
%loadPolly -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s \ -; RUN: -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll \ -; RUN: | FileCheck %s --check-prefix=HOST-IR - -; Test that we do recognise and codegen a kernel that has functions that can -; be mapped to NVIDIA's libdevice - -; REQUIRES: pollyacc - -; Check that we model the kernel as a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end - -; Check that the intrinsic call is present in the kernel IR. -; KERNEL-IR: %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_) -; KERNEL-IR: %p_cosf = tail call float @__nv_cosf(float %p_expf) -; KERNEL-IR: %p_logf = tail call float @__nv_logf(float %p_cosf) - -; Powi and exp cannot be lowered directly. Rather, we expect them to be -; lowered by libdevice. -; KERNEL-IR: %p_powi = tail call float @__nv_powif(float %p_logf, i32 2) -; KERNEL-IR: %p_exp = tail call float @__nv_expf(float %p_powi) - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. 
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - - -; void f(float *A, float *B, int N) { -; for(int i = 0; i < N; i++) { -; float tmp0 = A[i]; -; float expf = expf(tmp1); -; cosf = cosf(expf); -; logf = logf(cosf); -; powi = powi(logf, 2); -; exp = exp(powi); -; B[i] = logf; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %A, ptr %B, i32 %N) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp1 = icmp sgt i32 %N, 0 - br i1 %cmp1, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.body - %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %A.arr.i = getelementptr inbounds float, ptr %A, i64 %indvars.iv - %A.arr.i.val = load float, ptr %A.arr.i, align 4 - ; Call to intrinsics that should be part of the kernel. - %expf = tail call float @expf(float %A.arr.i.val) - %cosf = tail call float @cosf(float %expf) - %logf = tail call float @logf(float %cosf) - %powi = tail call float @llvm.powi.f32.i32(float %logf, i32 2) - %exp = tail call float @llvm.exp.f32(float %powi) - %B.arr.i = getelementptr inbounds float, ptr %B, i64 %indvars.iv - store float %exp, ptr %B.arr.i, align 4 - - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %wide.trip.count = zext i32 %N to i64 - %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - -; Function Attrs: nounwind readnone -declare float @expf(float) #0 -declare float @cosf(float) #0 -declare float @logf(float) #0 -declare float @llvm.powi.f32.i32(float, i32) #0 -declare float @llvm.exp.f32(float) #0 - -attributes #0 = { nounwind readnone } - diff --git 
a/polly/test/GPGPU/live-range-reordering-with-privatization.ll b/polly/test/GPGPU/live-range-reordering-with-privatization.ll deleted file mode 100644 index 3b047fd557ff..000000000000 --- a/polly/test/GPGPU/live-range-reordering-with-privatization.ll +++ /dev/null @@ -1,78 +0,0 @@ - ; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ -; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-code -disable-output \ -; RUN: < %s | FileCheck %s -check-prefix=CODE - -; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \ -; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-kernel-ir -disable-output \ -; RUN: < %s | FileCheck %s -check-prefix=KERNELIR - -; REQUIRES: pollyacc - -; void f(const int *end, int *arr, const int *control, const int *readarr) { -; for (int i = 0; i < *end; i++) { -; int t = 0; -; if (*control > 3) { -; t += readarr[i]; -; } -; arr[i] = t; -; } -; } - -; This test case tests the ability to infer that `t` is local to each loop -; iteration, and can therefore be privatized. 
- -; CODE: # kernel0 -; CODE-NEXT: for (int c0 = 0; c0 <= (tmp - 32 * b0 - 1) / 1048576; c0 += 1) -; CODE-NEXT: if (tmp >= 32 * b0 + t0 + 1048576 * c0 + 1) { -; CODE-NEXT: Stmt_for_body_last(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: if (tmp1 >= 4) -; CODE-NEXT: Stmt_if_then(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: Stmt_if_end(32 * b0 + t0 + 1048576 * c0); -; CODE-NEXT: } - -; KERNELIR: %private_array = alloca i32 - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" -target triple = "i386-apple-macosx10.12.0" - -define void @f(ptr %end, ptr %arr, ptr %control, ptr %readarr) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %tmp3 = load i32, ptr %end, align 4 - %cmp4 = icmp sgt i32 %tmp3, 0 - br i1 %cmp4, label %for.body.lr.ph, label %for.end - -for.body.lr.ph: ; preds = %entry.split - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %if.end - %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ] - %tmp1 = load i32, ptr %control, align 4 - %cmp1 = icmp sgt i32 %tmp1, 3 - br i1 %cmp1, label %if.then, label %if.end - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, ptr %readarr, i32 %i.05 - %tmp2 = load i32, ptr %arrayidx, align 4 - br label %if.end - -if.end: ; preds = %if.then, %for.body - %t.0 = phi i32 [ %tmp2, %if.then ], [ 0, %for.body ] - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i32 %i.05 - store i32 %t.0, ptr %arrayidx2, align 4 - %inc = add nuw nsw i32 %i.05, 1 - %tmp = load i32, ptr %end, align 4 - %cmp = icmp slt i32 %inc, %tmp - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %if.end - br label %for.end - -for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split - ret void -} - diff --git a/polly/test/GPGPU/loops-outside-scop.ll b/polly/test/GPGPU/loops-outside-scop.ll deleted file mode 100644 index 36b3a706338a..000000000000 --- a/polly/test/GPGPU/loops-outside-scop.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: 
opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - -; There is no FileCheck because we want to make sure that this doesn't crash. -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \ -; RUN: -disable-output < %s - -; REQUIRES: pollyacc - -; Due to the existence of the `fence` call, We can only detect the inner loop -; and not the outer loop. PPCGCodeGeneration had not implemented this case. -; The fix was to pull the implementation from `IslNodeBuilder. - -; Make sure that we only capture the inner loop -; SCOP: Function: f -; SCOP-NEXT: Region: %for2.body---%for2.body.fence -; SCOP-NEXT: Max Loop Depth: 1 - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -declare void @fn_to_fence(ptr %val) - -; void f(int *arr, bool shouldcont) { -; for(int i = 0; ; i++) { -; for(int j = 0; j < 10; j++) { -; arr[j] = i; -; } -; fence(arr); -; if (!shouldcont) break; -; } -; } - - -; Function Attrs: nounwind uwtable -define void @f(ptr %arr, i1 %shouldcont) #1 { -entry: - br label %for.init - -for.init: ; preds = %for.end, %entry.split - %i = phi i32 [ %i.next, %for.end ], [ 0, %entry ] - br label %for2.body - -for2.body: ; preds = %"65", %"64" - %j = phi i32 [ %j.next, %for2.body ], [ 0, %for.init ] - %j.sext = sext i32 %j to i64 - %arr.slot = getelementptr i32, ptr %arr, i64 %j.sext - store i32 %i, ptr %arr.slot, align 4 - %exitcond = icmp eq i32 %j, 10 - %j.next = add i32 %j, 1 - br i1 %exitcond, label %for2.body.fence, label %for2.body - -for2.body.fence: ; preds = %"65" - call void @fn_to_fence(ptr %arr) #2 - br i1 %shouldcont, label %for.end, label %exit -for.end: ; preds = %"69" - %i.next = add i32 %i, 1 - br label %for.init - -exit: ; preds = %"69" - ret void - -} - - -attributes #0 = { argmemonly nounwind } 
-attributes #1 = { nounwind uwtable } -attributes #2 = { nounwind } diff --git a/polly/test/GPGPU/managed-memory-rewrite-alloca.ll b/polly/test/GPGPU/managed-memory-rewrite-alloca.ll deleted file mode 100644 index 6dbd87db5eb5..000000000000 --- a/polly/test/GPGPU/managed-memory-rewrite-alloca.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ -; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory -polly-acc-rewrite-allocas < %s | FileCheck %s --check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP: i32 MemRef_arr[*]; - -; Check that we generate a constructor call for @A.toptr -; HOST-IR-NOT: %arr = alloca [100 x i32] - -source_filename = "test.c" -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - - -define void @f() { -entry: - %arr = alloca [100 x i32] - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds [100 x i32], ptr %arr, i64 0, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, 
!"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll deleted file mode 100644 index 946da40919ec..000000000000 --- a/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll +++ /dev/null @@ -1,93 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -S -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR -; -; REQUIRES: pollyacc -; -; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and -; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory` -; pass, even inside `constantExpr`. This is necessary because a cookie cutter -; Inst->replaceUsesOfWith(...) call does not actually work, because this does -; not replace the instruction within a ConstantExpr. -; -; #include <memory.h> -; -; static const int N = 100; -; int* f(int *ToFree) { -; free(ToFree); -; int *A = (int *)malloc(sizeof(int) * N); -; for(int i = 0; i < N; i++) { -; A[i] = 42; -; } -; return A; -; -; } - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; SCOP: Arrays { -; SCOP-NEXT: i32 MemRef_tmp[*]; // Element size 4 -; SCOP-NEXT: } - -; // Check that polly_mallocManaged is declared and used correctly. -; HOST-IR: declare ptr @polly_mallocManaged(i64) - -; // Check that polly_freeManaged is declared and used correctly. 
-; HOST-IR call void @polly_freeManaged(i8* %toFree) -; HOST-IR: declare void @polly_freeManaged(ptr) - -; // Check that we remove the original malloc,free -; HOST-IR-NOT: declare ptr @malloc(i64) -; HOST-IR-NOT: declare void @free(ptr) - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -define ptr @f(ptr %toFree) { -entry: - ; Free inside bitcast - call void @free (ptr %toFree) - br label %entry.split - -entry.split: ; preds = %entry - ; malloc inside bitcast. - %tmp = call ptr @malloc (i64 400) - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %tmp, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret ptr %tmp -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - -declare ptr @malloc(i64) -declare void @free(ptr) - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll deleted file mode 100644 index 8e456127b127..000000000000 --- a/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt 
%loadPolly -polly-codegen-ppcg \ -; RUN: -S -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR -; -; REQUIRES: pollyacc -; -; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and -; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory` -; pass. -; -; #include <memory.h> -; -; static const int N = 100; -; int* f(int *ToFree) { -; free(ToFree); -; int *A = (int *)malloc(sizeof(int) * N); -; for(int i = 0; i < N; i++) { -; A[i] = 42; -; } -; return A; -; -; } - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; SCOP: Arrays { -; SCOP-NEXT: i32 MemRef_call[*]; // Element size 4 -; SCOP-NEXT: } - -; // Check that polly_mallocManaged is declared and used correctly. -; HOST-IR: %call = tail call ptr @polly_mallocManaged(i64 400) -; HOST-IR: declare ptr @polly_mallocManaged(i64) - -; // Check that polly_freeManaged is declared and used correctly. 
-; HOST-IR %toFreeBitcast = bitcast i32* %toFree to i8* -; HOST-IR call void @polly_freeManaged(i8* %toFreeBitcast) -; HOST-IR: declare void @polly_freeManaged(ptr) - -; // Check that we remove the original malloc,free -; HOST-IR-NOT: declare ptr @malloc(i64) -; HOST-IR-NOT: declare void @free(ptr) - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -define ptr @f(ptr %toFree) { -entry: - call void @free(ptr %toFree) - br label %entry.split - -entry.split: ; preds = %entry - %call = tail call ptr @malloc(i64 400) - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %call, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret ptr %call -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - -declare ptr @malloc(i64) -declare void @free(ptr) - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/memory-only-referenced-from-access.ll b/polly/test/GPGPU/memory-only-referenced-from-access.ll deleted file mode 100644 index b3828950324a..000000000000 --- a/polly/test/GPGPU/memory-only-referenced-from-access.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: 
-polly-invariant-load-hoisting -polly-ignore-aliasing \ -; RUN: -polly-process-unprofitable -polly-ignore-parameter-bounds \ -; RUN: -polly-acc-fail-on-verify-module-failure \ -; RUN: -polly-acc-codegen-managed-memory \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc - -; Verify that we correctly generate a kernel even if certain invariant load -; hoisted parameters appear only in memory accesses, but not domain elements. - -; CHECK: @FUNC_quux_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_tmp4, i32 %tmp3, i32 %tmp, i32 %tmp31, i32 %tmp2) - -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" - -%struct.hoge = type { ptr, i64, i64, [1 x %struct.widget] } -%struct.widget = type { i64, i64, i64 } - -@global = external unnamed_addr global %struct.hoge, align 32 - -define void @quux(ptr noalias %arg, ptr noalias %arg1) { -bb: - %tmp = load i32, ptr %arg, align 4 - %tmp2 = sext i32 %tmp to i64 - %tmp3 = load i32, ptr %arg1, align 4 - %tmp4 = load ptr, ptr @global, align 32 - br label %bb5 - -bb5: ; preds = %bb5, %bb - %tmp6 = phi i32 [ %tmp11, %bb5 ], [ 0, %bb ] - %tmp7 = sext i32 %tmp6 to i64 - %tmp8 = sub nsw i64 %tmp7, %tmp2 - %tmp9 = getelementptr [0 x double], ptr %tmp4, i64 0, i64 %tmp8 - store double undef, ptr %tmp9, align 8 - %tmp10 = icmp eq i32 %tmp6, %tmp3 - %tmp11 = add i32 %tmp6, 1 - br i1 %tmp10, label %bb12, label %bb5 - -bb12: ; preds = %bb5 - ret void -} diff --git a/polly/test/GPGPU/mostly-sequential.ll b/polly/test/GPGPU/mostly-sequential.ll deleted file mode 100644 index c42c24482a38..000000000000 --- a/polly/test/GPGPU/mostly-sequential.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; void 
foo(float A[]) { -; for (long i = 0; i < 128; i++) -; A[i] += i; -; -; for (long i = 0; i < 128; i++) -; for (long j = 0; j < 128; j++) -; A[42] += i + j; -; } - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(4); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock; -; CODE-NEXT: dim3 k1_dimGrid; -; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (128) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb4(32 * b0 + t0); - -; CODE: # kernel1 -; CODE-NEXT: for (int c0 = 0; c0 <= 127; c0 += 1) -; CODE-NEXT: for (int c1 = 0; c1 <= 127; c1 += 1) -; CODE-NEXT: Stmt_bb14(c0, c1); - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A) { -bb: - br label %bb3 - -bb3: ; preds = %bb8, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ] - %exitcond2 = icmp ne i64 %i.0, 128 - br i1 %exitcond2, label %bb4, label %bb10 - -bb4: ; preds = %bb3 - %tmp = sitofp i64 %i.0 to float - %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp6 = load float, ptr %tmp5, align 4 - %tmp7 = fadd float %tmp6, %tmp - store float %tmp7, ptr %tmp5, align 4 - br label %bb8 - -bb8: ; preds = %bb4 - %tmp9 = add nuw nsw i64 %i.0, 1 - br label %bb3 - -bb10: ; preds = %bb3 - br label %bb11 - -bb11: ; preds = %bb23, %bb10 - %i1.0 = phi i64 [ 0, %bb10 ], [ %tmp24, %bb23 ] - %exitcond1 = icmp ne i64 %i1.0, 128 - br i1 %exitcond1, label %bb12, label %bb25 - -bb12: ; preds = %bb11 - br label %bb13 - -bb13: ; preds = %bb20, %bb12 - %j.0 = phi i64 [ 0, %bb12 ], [ %tmp21, %bb20 ] - %exitcond = icmp 
ne i64 %j.0, 128 - br i1 %exitcond, label %bb14, label %bb22 - -bb14: ; preds = %bb13 - %tmp15 = add nuw nsw i64 %i1.0, %j.0 - %tmp16 = sitofp i64 %tmp15 to float - %tmp17 = getelementptr inbounds float, ptr %A, i64 42 - %tmp18 = load float, ptr %tmp17, align 4 - %tmp19 = fadd float %tmp18, %tmp16 - store float %tmp19, ptr %tmp17, align 4 - br label %bb20 - -bb20: ; preds = %bb14 - %tmp21 = add nuw nsw i64 %j.0, 1 - br label %bb13 - -bb22: ; preds = %bb13 - br label %bb23 - -bb23: ; preds = %bb22 - %tmp24 = add nuw nsw i64 %i1.0, 1 - br label %bb11 - -bb25: ; preds = %bb11 - ret void -} diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll deleted file mode 100644 index 1ce6e0991ebb..000000000000 --- a/polly/test/GPGPU/non-read-only-scalars.ll +++ /dev/null @@ -1,168 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR -; -; REQUIRES: pollyacc -; -; #include <stdio.h> -; -; float foo(float A[]) { -; float sum = 0; -; -; for (long i = 0; i < 32; i++) -; A[i] = i; -; -; for (long i = 0; i < 32; i++) -; A[i] += i; -; -; for (long i = 0; i < 32; i++) -; sum += A[i]; -; -; return sum; -; } -; -; int main() { -; float A[32]; -; float sum = foo(A); -; printf("%f\n", sum); -; } - -; CODE: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock; -; CODE-NEXT: dim3 k1_dimGrid; -; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_sum_0__phi); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k2_dimBlock; -; CODE-NEXT: dim3 k2_dimGrid; -; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> 
(dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0__phi)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_sum_0)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: { -; CODE-NEXT: Stmt_bb4(t0); -; CODE-NEXT: Stmt_bb10(t0); -; CODE-NEXT: } - -; CODE: # kernel1 -; CODE-NEXT: Stmt_bb17(); - -; CODE: # kernel2 -; TODO-NEXT: { -; TODO-NEXT: read(); -; TODO-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) { -; TODO-NEXT: Stmt_bb18(c0); -; TODO-NEXT: if (c0 <= 31) -; TODO-NEXT: Stmt_bb20(c0); -; TODO-NEXT: } -; TODO-NEXT: write(); -; TODO-NEXT: } - - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(ptr addrspace(1) %MemRef_sum_0__phi) -; KERNEL-IR: store float 0.000000e+00, ptr %sum.0.phiops -; KERNEL-IR: [[REGA:%.+]] = addrspacecast ptr addrspace(1) %MemRef_sum_0__phi to ptr -; KERNEL-IR: [[REGB:%.+]] = load float, ptr %sum.0.phiops -; KERNEL-IR: store float [[REGB]], ptr [[REGA]] - -; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2(ptr addrspace(1) %MemRef_A, ptr addrspace(1) %MemRef_sum_0__phi, ptr addrspace(1) %MemRef_sum_0) - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1 - -define float @foo(ptr %A) { -bb: - br label %bb3 - -bb3: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %exitcond2 = icmp ne i64 %i.0, 32 - br i1 %exitcond2, label %bb4, label %bb8 - -bb4: ; preds = %bb3 - %tmp = sitofp i64 %i.0 to float - %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0 - store float %tmp, ptr %tmp5, align 4 - br label %bb6 - -bb6: ; preds = %bb4 - %tmp7 = add 
nuw nsw i64 %i.0, 1 - br label %bb3 - -bb8: ; preds = %bb3 - br label %bb9 - -bb9: ; preds = %bb15, %bb8 - %i1.0 = phi i64 [ 0, %bb8 ], [ %tmp16, %bb15 ] - %exitcond1 = icmp ne i64 %i1.0, 32 - br i1 %exitcond1, label %bb10, label %bb17 - -bb10: ; preds = %bb9 - %tmp11 = sitofp i64 %i1.0 to float - %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0 - %tmp13 = load float, ptr %tmp12, align 4 - %tmp14 = fadd float %tmp13, %tmp11 - store float %tmp14, ptr %tmp12, align 4 - br label %bb15 - -bb15: ; preds = %bb10 - %tmp16 = add nuw nsw i64 %i1.0, 1 - br label %bb9 - -bb17: ; preds = %bb9 - br label %bb18 - -bb18: ; preds = %bb20, %bb17 - %sum.0 = phi float [ 0.000000e+00, %bb17 ], [ %tmp23, %bb20 ] - %i2.0 = phi i64 [ 0, %bb17 ], [ %tmp24, %bb20 ] - %exitcond = icmp ne i64 %i2.0, 32 - br i1 %exitcond, label %bb19, label %bb25 - -bb19: ; preds = %bb18 - br label %bb20 - -bb20: ; preds = %bb19 - %tmp21 = getelementptr inbounds float, ptr %A, i64 %i2.0 - %tmp22 = load float, ptr %tmp21, align 4 - %tmp23 = fadd float %sum.0, %tmp22 - %tmp24 = add nuw nsw i64 %i2.0, 1 - br label %bb18 - -bb25: ; preds = %bb18 - %sum.0.lcssa = phi float [ %sum.0, %bb18 ] - ret float %sum.0.lcssa -} - -define i32 @main() { -bb: - %A = alloca [32 x float], align 16 - %tmp1 = call float @foo(ptr %A) - %tmp2 = fpext float %tmp1 to double - %tmp3 = call i32 (ptr, ...) @printf(ptr @.str, double %tmp2) #2 - ret i32 0 -} - -declare i32 @printf(ptr, ...) 
#1 - diff --git a/polly/test/GPGPU/non-zero-array-offset.ll b/polly/test/GPGPU/non-zero-array-offset.ll deleted file mode 100644 index f18f6828a47f..000000000000 --- a/polly/test/GPGPU/non-zero-array-offset.ll +++ /dev/null @@ -1,116 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR -; -; REQUIRES: pollyacc - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice)); - -; CODE: dim3 k0_dimBlock(8); -; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock(8); -; CODE-NEXT: dim3 k1_dimGrid(1); -; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost)); - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb3(t0); - -; CODE: # kernel1 -; CODE-NEXT: Stmt_bb11(t0); - -; IR: %p_dev_array_MemRef_B = call ptr @polly_allocateMemoryForDevice(i64 32) -; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 32) -; IR-NEXT: [[REG0:%.+]] = getelementptr float, ptr %B, i64 8 -; IR-NEXT: call void @polly_copyFromHostToDevice(ptr [[REG0]], ptr %p_dev_array_MemRef_B, i64 32) - -; IR: [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_B) -; IR-NEXT: [[REGC:%.+]] = getelementptr float, ptr [[REGA]], i64 -8 - -; void foo(float A[], float B[]) { -; for (long i = 0; i < 8; i++) -; B[i + 8] *= 4; -; -; for 
(long i = 0; i < 8; i++) -; A[i] *= 12; -; } -; -; #ifdef OUTPUT -; int main() { -; float A[16]; -; -; for (long i = 0; i < 16; i++) { -; __sync_synchronize(); -; A[i] = i; -; } -; -; foo(A, A); -; -; float sum = 0; -; for (long i = 0; i < 16; i++) { -; __sync_synchronize(); -; sum += A[i]; -; } -; -; printf("%f\n", sum); -; } -; #endif -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb2 - -bb2: ; preds = %bb7, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ] - %exitcond1 = icmp ne i64 %i.0, 8 - br i1 %exitcond1, label %bb3, label %bb9 - -bb3: ; preds = %bb2 - %tmp = add nuw nsw i64 %i.0, 8 - %tmp4 = getelementptr inbounds float, ptr %B, i64 %tmp - %tmp5 = load float, ptr %tmp4, align 4 - %tmp6 = fmul float %tmp5, 4.000000e+00 - store float %tmp6, ptr %tmp4, align 4 - br label %bb7 - -bb7: ; preds = %bb3 - %tmp8 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb9: ; preds = %bb2 - br label %bb10 - -bb10: ; preds = %bb15, %bb9 - %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ] - %exitcond = icmp ne i64 %i1.0, 8 - br i1 %exitcond, label %bb11, label %bb17 - -bb11: ; preds = %bb10 - %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0 - %tmp13 = load float, ptr %tmp12, align 4 - %tmp14 = fmul float %tmp13, 1.200000e+01 - store float %tmp14, ptr %tmp12, align 4 - br label %bb15 - -bb15: ; preds = %bb11 - %tmp16 = add nuw nsw i64 %i1.0, 1 - br label %bb10 - -bb17: ; preds = %bb10 - ret void -} diff --git a/polly/test/GPGPU/only-part-of-array-modified.ll b/polly/test/GPGPU/only-part-of-array-modified.ll deleted file mode 100644 index abc380badfb6..000000000000 --- a/polly/test/GPGPU/only-part-of-array-modified.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s -; -; REQUIRES: pollyacc -; -; void foo(float A[], float B[]) { -; for (long i = 0; i < 1024; i++) -; A[2 * i] = B[i]; -; } 
- -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i32), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2047) * sizeof(i32), cudaMemcpyHostToDevice)); - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -bb: - br label %bb1 - -bb1: ; preds = %bb8, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb10 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds float, ptr %B, i64 %i.0 - %tmp4 = load i32, ptr %tmp, align 4 - %tmp5 = shl nsw i64 %i.0, 1 - %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5 - store i32 %tmp4, ptr %tmp6, align 4 - br label %bb8 - -bb8: ; preds = %bb2 - %tmp9 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb10: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/parametric-loop-bound.ll b/polly/test/GPGPU/parametric-loop-bound.ll deleted file mode 100644 index e436bd663a4a..000000000000 --- a/polly/test/GPGPU/parametric-loop-bound.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=IR %s - -; REQUIRES: pollyacc - -; void foo(long A[], long n) { -; for (long i = 0; i < n; i++) -; A[i] += 100; -; } - -; CODE: if (n >= 1) { -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (n) * sizeof(i64), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(n >= 1048545 ? 
32768 : (n + 31) / 32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, n); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (n) * sizeof(i64), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: for (int c0 = 0; c0 <= (n - 32 * b0 - 1) / 1048576; c0 += 1) -; CODE-NEXT: if (n >= 32 * b0 + t0 + 1048576 * c0 + 1) -; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0); - -; IR: store i64 %n, ptr %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1 -; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGA]] - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, i64 %n) { -bb: - br label %bb1 - -bb1: ; preds = %bb6, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb8 - -bb2: ; preds = %bb1 - %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp4 = load i64, ptr %tmp3, align 8 - %tmp5 = add nsw i64 %tmp4, 100 - store i64 %tmp5, ptr %tmp3, align 8 - br label %bb6 - -bb6: ; preds = %bb2 - %tmp7 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb8: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/partial_writes.ll b/polly/test/GPGPU/partial_writes.ll deleted file mode 100644 index c3df624df7ac..000000000000 --- a/polly/test/GPGPU/partial_writes.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: opt %loadPolly -polly-import-jscop -polly-codegen-ppcg -polly-stmt-granularity=bb -S < %s \ -; RUN: | FileCheck %s - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; CHECK: polly_launchKernel - -; Function Attrs: nounwind uwtable -define void @partial_writes() { -bb: - %tmp = tail call ptr @wibble() #2 - br label %bb2 - -bb2: ; preds = %bb11, %bb - %tmp3 = phi i64 [ 0, %bb ], [ 
%tmp12, %bb11 ] - %tmp4 = getelementptr inbounds [1200 x double], ptr %tmp, i64 0, i64 %tmp3 - %tmp5 = load double, ptr %tmp4, align 8, !tbaa !1 - br label %bb6 - -bb6: ; preds = %bb6, %bb2 - %tmp7 = phi double [ undef, %bb2 ], [ undef, %bb6 ] - %tmp8 = phi i64 [ 0, %bb2 ], [ %tmp9, %bb6 ] - store double undef, ptr %tmp4, align 8, !tbaa !1 - %tmp9 = add nuw nsw i64 %tmp8, 1 - %tmp10 = icmp eq i64 %tmp9, 900 - br i1 %tmp10, label %bb11, label %bb6 - -bb11: ; preds = %bb6 - %tmp12 = add nuw nsw i64 %tmp3, 1 - %tmp13 = icmp eq i64 %tmp12, 1200 - br i1 %tmp13, label %bb14, label %bb2 - -bb14: ; preds = %bb11 - ret void -} - -declare ptr @wibble() - - -!llvm.ident = !{!0} - -!0 = !{!"clang version 6.0.0 (trunk 309912) (llvm/trunk 309933)"} -!1 = !{!2, !2, i64 0} -!2 = !{!"double", !3, i64 0} -!3 = !{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop b/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop deleted file mode 100644 index d5b537ee1f05..000000000000 --- a/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop +++ /dev/null @@ -1,47 +0,0 @@ -{ - "arrays" : [ - { - "name" : "MemRef_tmp", - "sizes" : [ "*" ], - "type" : "double" - } - ], - "context" : "{ : }", - "name" : "%bb2---%bb14", - "statements" : [ - { - "accesses" : [ - { - "kind" : "read", - "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }" - }, - { - "kind" : "write", - "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }" - } - ], - "domain" : "{ Stmt_bb2[i0] : 0 <= i0 <= 1199 }", - "name" : "Stmt_bb2", - "schedule" : "{ Stmt_bb2[i0] -> [i0, 0, 0] }" - }, - { - "accesses" : [ - { - "kind" : "write", - "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] : i1 <= 898 }" - }, - { - "kind" : "read", - "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }" - }, - { - "kind" : "write", - "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }" - } - ], - "domain" : "{ Stmt_bb6[i0, i1] : 0 <= i0 <= 1199 and 0 <= i1 <= 899 }", - "name" : 
"Stmt_bb6", - "schedule" : "{ Stmt_bb6[i0, i1] -> [i0, 1, i1] }" - } - ] -} diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll deleted file mode 100644 index acb1f2c4e0e2..000000000000 --- a/polly/test/GPGPU/phi-nodes-in-kernel.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; REQUIRES: pollyacc - -; Approximate C source: -; void kernel_dynprog(int c[50]) { -; int iter = 0; -; int outl = 0; -; -; while(1) { -; for(int indvar = 1 ; indvar <= 49; indvar++) { -; c[indvar] = undef; -; } -; add78 = c[49] + outl; -; inc80 = iter + 1; -; -; if (true) break; -; -; outl = add78; -; iter = inc80; -; } -;} -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; CODE: cudaCheckReturn(cudaMalloc((void **) &dev_MemRef_c, (50) * sizeof(i32))); - -; CODE: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(2); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_c); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_c)); - -; CODE: # kernel0 -; CODE-NEXT: if (32 * b0 + t0 <= 48) -; CODE-NEXT: Stmt_for_body17(0, 32 * b0 + t0); - -; IR-LABEL: call void @polly_freeKernel -; IR: [[REGC:%.+]] = bitcast i32* %{{[0-9]+}} to i8* -; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196) - -; KERNEL-IR: define ptx_kernel void 
@FUNC_kernel_dynprog_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_c) #0 { -; KERNEL-IR: %polly.access.MemRef_c = getelementptr i32, i32 addrspace(1)* %polly.access.cast.MemRef_c, i64 %9 -; KERNEL-IR-NEXT: store i32 422, i32 addrspace(1)* %polly.access.MemRef_c, align 4 - -define void @kernel_dynprog([50 x i32]* %c) { -entry: - %arrayidx77 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 49 - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.cond15.for.cond12.loopexit_crit_edge, %entry - %out_l.055 = phi i32 [ 0, %entry ], [ %add78, %for.cond15.for.cond12.loopexit_crit_edge ] - %iter.054 = phi i32 [ 0, %entry ], [ %inc80, %for.cond15.for.cond12.loopexit_crit_edge ] - br label %for.body17 - -for.cond15.for.cond12.loopexit_crit_edge: ; preds = %for.body17 - %tmp = load i32, i32* %arrayidx77, align 4 - %add78 = add nsw i32 %tmp, %out_l.055 - %inc80 = add nuw nsw i32 %iter.054, 1 - br i1 false, label %for.cond1.preheader, label %for.end81 - -for.body17: ; preds = %for.body17, %for.cond1.preheader - %indvars.iv71 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next72, %for.body17 ] - %arrayidx69 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 %indvars.iv71 - store i32 422, i32* %arrayidx69, align 4 - %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1 - %lftr.wideiv74 = trunc i64 %indvars.iv.next72 to i32 - %exitcond75 = icmp ne i32 %lftr.wideiv74, 50 - br i1 %exitcond75, label %for.body17, label %for.cond15.for.cond12.loopexit_crit_edge - -for.end81: ; preds = %for.cond15.for.cond12.loopexit_crit_edge - ret void -} diff --git a/polly/test/GPGPU/private-memory.ll b/polly/test/GPGPU/private-memory.ll deleted file mode 100644 index d4ba9fa19b39..000000000000 --- a/polly/test/GPGPU/private-memory.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-private \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; 
RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-acc-use-private \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; void add(float *A) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 10; j++) -; A[i] += 1; -; } - -; CODE: # kernel0 -; CODE: { -; CODE: read(t0); -; CODE: for (int c3 = 0; c3 <= 9; c3 += 1) -; CODE: Stmt_bb5(t0, c3); -; CODE: write(t0); -; CODE: } - -; KERNEL: %private_array = alloca [1 x float] - -; KERNEL: %polly.access.cast.private_array = bitcast [1 x float]* %private_array to float* -; KERNEL-NEXT: %polly.access.private_array = getelementptr float, float* %polly.access.cast.private_array, i64 0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0 -; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_A -; KERNEL-NEXT: store float %shared.read, float* %polly.access.private_array - -; KERNEL: %polly.access.cast.private_array5 = bitcast [1 x float]* %private_array to float* -; KERNEL-NEXT: %polly.access.private_array6 = getelementptr float, float* %polly.access.cast.private_array5, i64 0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A7 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A8 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A7, i64 %t0 -; KERNEL-NEXT: %shared.write = load float, float* %polly.access.private_array6 -; KERNEL-NEXT: store float %shared.write, float addrspace(1)* %polly.access.MemRef_A8 - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @add(float* %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb11, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label 
%bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb8, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %j.0, 10 - br i1 %exitcond, label %bb5, label %bb10 - -bb5: ; preds = %bb4 - %tmp = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp6 = load float, float* %tmp, align 4 - %tmp7 = fadd float %tmp6, 1.000000e+00 - store float %tmp7, float* %tmp, align 4 - br label %bb8 - -bb8: ; preds = %bb5 - %tmp9 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb10: ; preds = %bb4 - br label %bb11 - -bb11: ; preds = %bb10 - %tmp12 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb13: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/privatization-simple.ll b/polly/test/GPGPU/privatization-simple.ll deleted file mode 100644 index c715b8e77b67..000000000000 --- a/polly/test/GPGPU/privatization-simple.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - -; Check that kernel launch is generated in host IR. -; the declare would not be generated unless a call to a kernel exists. 
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; void f(int A[], int B[], int control, int C[]) { -; int x; -; #pragma scop -; for(int i = 0; i < 1000; i ++) { -; x = 0; -; if(control) x = C[i]; -; B[i] = x * A[i]; -; -; } -; #pragma endscop -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %A, ptr %B, i32 %control, ptr %C) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %if.end - %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] - %tobool = icmp eq i32 %control, 0 - br i1 %tobool, label %if.end, label %if.then - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv - %tmp4 = load i32, ptr %arrayidx, align 4 - br label %if.end - -if.end: ; preds = %for.body, %if.then - %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] - %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv - %tmp8 = load i32, ptr %arrayidx2, align 4 - %mul = mul nsw i32 %tmp8, %x.0 - %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv - store i32 %mul, ptr %arrayidx4, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %for.body, label %for.end - -for.end: ; preds = %if.end - ret void -} diff --git a/polly/test/GPGPU/privatization.ll b/polly/test/GPGPU/privatization.ll deleted file mode 100644 index fbb291575146..000000000000 --- a/polly/test/GPGPU/privatization.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: checkPrivatization -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 - - -; Check that kernel launch is generated in host IR. 
-; the declare would not be generated unless a call to a kernel exists. -; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr) - -; -; -; void checkPrivatization(int A[], int B[], int C[], int control) { -; int x; -; #pragma scop -; for (int i = 0; i < 1000; i++) { -; x = 0; -; if (control) -; x += C[i]; -; -; B[i] = x * A[i]; -; } -; #pragma endscop -; } -; -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @checkPrivatization(ptr %A, ptr %B, ptr %C, i32 %control) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %if.end - %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ] - %tobool = icmp eq i32 %control, 0 - br i1 %tobool, label %if.end, label %if.then - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv - %tmp4 = load i32, ptr %arrayidx, align 4 - br label %if.end - -if.end: ; preds = %for.body, %if.then - %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ] - %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv - %tmp9 = load i32, ptr %arrayidx2, align 4 - %mul = mul nsw i32 %tmp9, %x.0 - %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv - store i32 %mul, ptr %arrayidx4, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %for.body, label %for.end - -for.end: ; preds = %if.end - ret void -} diff --git a/polly/test/GPGPU/region-stmt.ll b/polly/test/GPGPU/region-stmt.ll deleted file mode 100644 index 8e392fb30062..000000000000 --- a/polly/test/GPGPU/region-stmt.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; CODE: 
cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (128) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(4); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_B); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (128) * sizeof(float), cudaMemcpyDeviceToHost)); - -; CODE: # kernel0 -; CODE-NEXT: Stmt_for_body__TO__if_end(32 * b0 + t0); - -; IR: @polly_initContext - -; KERNEL-IR: kernel_0 - -; REQUIRES: pollyacc - -; void foo(float A[], float B[]) { -; for (long i = 0; i < 128; i++) -; if (A[i] == 42) -; B[i] += 2 * i; -; else -; B[i] += 4 * i; -; } -; -source_filename = "/tmp/test.c" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(ptr %A, ptr %B) { -entry: - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] - %exitcond = icmp ne i64 %i.0, 128 - br i1 %exitcond, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %arrayidx = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp = load float, ptr %arrayidx, align 4 - %cmp1 = fcmp oeq float %tmp, 4.200000e+01 - br i1 %cmp1, label %if.then, label %if.else - -if.then: ; preds = %for.body - %mul = shl nsw i64 %i.0, 1 - %conv = sitofp i64 %mul to float - %arrayidx2 = getelementptr inbounds float, ptr %B, i64 %i.0 - %tmp1 = load float, ptr %arrayidx2, align 4 - %add = fadd float %tmp1, %conv - store float %add, ptr %arrayidx2, align 4 - br label %if.end - -if.else: ; preds = %for.body - %mul3 = shl nsw i64 %i.0, 2 - %conv4 = sitofp i64 %mul3 to float - %arrayidx5 = getelementptr inbounds float, ptr %B, i64 %i.0 - %tmp2 = load float, ptr %arrayidx5, align 4 - %add6 = fadd float %tmp2, %conv4 - store float %add6, ptr %arrayidx5, align 4 - br 
label %if.end - -if.end: ; preds = %if.else, %if.then - br label %for.inc - -for.inc: ; preds = %if.end - %inc = add nuw nsw i64 %i.0, 1 - br label %for.cond - -for.end: ; preds = %for.cond - ret void -} diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll b/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll deleted file mode 100644 index 326236cf92fd..000000000000 --- a/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4 -; KERNEL-IR-NEXT: br label %polly.merge - -define void @kernel_dynprog(ptr %sum_c) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry - br label %for.body3 - -for.cond1.loopexit: ; preds = %for.end - %indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1 - %exitcond57 = icmp ne i64 %indvars.iv.next56, 49 - br i1 %exitcond57, label %for.body3, label %for.inc55 - -for.body3: ; preds = %for.cond1.loopexit, %for.cond1.preheader - %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ] - %indvars.iv48 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next49, %for.cond1.loopexit ] - %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1 - %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv48, i64 %indvars.iv55 - store i32 0, ptr %arrayidx10, align 4 - %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv48 - br label %for.end - -for.end: ; preds = %for.body3 - br label %for.cond1.loopexit - -for.inc55: ; preds = %for.cond1.loopexit - ret void -} diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll 
b/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll deleted file mode 100644 index 2024f006c53a..000000000000 --- a/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s -check-prefix=KERNEL-IR - -; REQUIRES: pollyacc - -; Ensure that no dead instructions are emitted between the store and the -; branch instruction of the ScopStmt. At some point, our dead-code-elimination -; did not remove code that was inserted to compute the old (unused) branch -; condition. This code referred to CPU registers and consequently resulted -; in invalid bitcode. - -; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4 -; KERNEL-IR-NEXT: br label %polly.merge - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @kernel_dynprog(ptr %sum_c) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %entry - br label %for.body3 - -for.cond4.for.cond1.loopexit_crit_edge: ; preds = %for.end - br label %for.cond1.loopexit - -for.cond1.loopexit: ; preds = %for.cond4.for.cond1.loopexit_crit_edge - br i1 undef, label %for.body3, label %for.inc55 - -for.body3: ; preds = %for.cond1.loopexit, %for.cond1.preheader - %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ] - %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1 - br label %for.body6 - -for.body6: ; preds = %for.end, %for.body3 - %indvars.iv50 = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next51, %for.end ] - %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 %indvars.iv55 - store i32 0, ptr %arrayidx10, align 4 - %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv50 - br i1 %cmp1334, label %for.body14.lr.ph, label %for.end - -for.body14.lr.ph: ; preds = %for.body6 - br 
label %for.body14 - -for.body14: ; preds = %for.body14, %for.body14.lr.ph - %arrayidx32 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 0 - br i1 false, label %for.body14, label %for.cond12.for.end_crit_edge - -for.cond12.for.end_crit_edge: ; preds = %for.body14 - br label %for.end - -for.end: ; preds = %for.cond12.for.end_crit_edge, %for.body6 - %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1 - %lftr.wideiv53 = trunc i64 %indvars.iv.next51 to i32 - %exitcond54 = icmp ne i32 %lftr.wideiv53, 50 - br i1 %exitcond54, label %for.body6, label %for.cond4.for.cond1.loopexit_crit_edge - -for.inc55: ; preds = %for.cond1.loopexit - unreachable -} diff --git a/polly/test/GPGPU/run-time-check.ll b/polly/test/GPGPU/run-time-check.ll deleted file mode 100644 index 3b04c3e01593..000000000000 --- a/polly/test/GPGPU/run-time-check.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR -; -; REQUIRES: pollyacc -; -; void foo(long n, float A[][32]) { -; for (long i = 0; i < n; i++) -; for (long j = 0; j < n; j++) -; A[i][j] += A[i + 1][j + 1]; -; } - -; IR: %tmp = icmp slt i64 %i.0, %n -; IR-NEXT: br i1 %tmp, label %bb2, label %polly.merge_new_and_old - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(i64 %n, ptr %A) { -bb: - br label %bb1 - -bb1: ; preds = %bb15, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp16, %bb15 ] - %tmp = icmp slt i64 %i.0, %n - br i1 %tmp, label %bb2, label %bb17 - -bb2: ; preds = %bb1 - br label %bb3 - -bb3: ; preds = %bb12, %bb2 - %j.0 = phi i64 [ 0, %bb2 ], [ %tmp13, %bb12 ] - %exitcond = icmp ne i64 %j.0, %n - br i1 %exitcond, label %bb4, label %bb14 - -bb4: ; preds = %bb3 - %tmp5 = add nuw nsw i64 %j.0, 1 - %tmp6 = add nuw nsw i64 %i.0, 1 - %tmp7 = getelementptr inbounds [32 x float], ptr %A, i64 %tmp6, i64 %tmp5 - %tmp8 = load float, ptr %tmp7, align 4 - %tmp9 = getelementptr inbounds [32 x float], ptr 
%A, i64 %i.0, i64 %j.0 - %tmp10 = load float, ptr %tmp9, align 4 - %tmp11 = fadd float %tmp10, %tmp8 - store float %tmp11, ptr %tmp9, align 4 - br label %bb12 - -bb12: ; preds = %bb4 - %tmp13 = add nuw nsw i64 %j.0, 1 - br label %bb3 - -bb14: ; preds = %bb3 - br label %bb15 - -bb15: ; preds = %bb14 - %tmp16 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb17: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-param-and-value-32-bit.ll b/polly/test/GPGPU/scalar-param-and-value-32-bit.ll deleted file mode 100644 index 0313d64e976c..000000000000 --- a/polly/test/GPGPU/scalar-param-and-value-32-bit.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} -; -; void foo(float A[], int n) { -; for (long j = 0; j < n; j++) -; A[j + n] += 42; -; } - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i32 %n) - -define void @foo(ptr %A, i32 %n) { -bb: - br label %bb1 - -bb1: ; preds = %bb9, %bb - %j.0 = phi i64 [ 0, %bb ], [ %tmp10, %bb9 ] - %tmp = sext i32 %n to i64 - %tmp2 = icmp slt i64 %j.0, %tmp - br i1 %tmp2, label %bb3, label %bb11 - -bb3: ; preds = %bb1 - %tmp4 = sext i32 %n to i64 - %tmp5 = add nsw i64 %j.0, %tmp4 - %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5 - %tmp7 = load float, ptr %tmp6, align 4 - %tmp8 = fadd float %tmp7, 4.200000e+01 - store float %tmp8, ptr %tmp6, align 4 - br label %bb9 - -bb9: ; preds = %bb3 - %tmp10 = add nuw nsw i64 %j.0, 1 - br label %bb1 - -bb11: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-param-and-value-use.ll b/polly/test/GPGPU/scalar-param-and-value-use.ll deleted file mode 100644 index 0301d88e16ac..000000000000 --- a/polly/test/GPGPU/scalar-param-and-value-use.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg 
-polly-acc-dump-kernel-ir \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=IR %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; void foo(long n, float A[][n]) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 32; j++) -; A[i][j] += A[i + 1][j + 1]; -; } - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; This test case failed at some point as %n was only available in this kernel -; when referenced through an isl_id in an isl ast expression, but not when -; it was referenced from a SCEV or instruction that not part of any loop -; bound. - -; IR: %polly.access.mul.MemRef_A = mul nsw i64 {{.*}}, %n - -define void @foo(i64 %n, ptr %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb19, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp20, %bb19 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb21 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb16, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp17, %bb16 ] - %exitcond = icmp ne i64 %j.0, 32 - br i1 %exitcond, label %bb5, label %bb18 - -bb5: ; preds = %bb4 - %tmp = add nuw nsw i64 %j.0, 1 - %tmp6 = add nuw nsw i64 %i.0, 1 - %tmp7 = mul nsw i64 %tmp6, %n - %tmp8 = getelementptr inbounds float, ptr %A, i64 %tmp7 - %tmp9 = getelementptr inbounds float, ptr %tmp8, i64 %tmp - %tmp10 = load float, ptr %tmp9, align 4 - %tmp11 = mul nsw i64 %i.0, %n - %tmp12 = getelementptr inbounds float, ptr %A, i64 %tmp11 - %tmp13 = getelementptr inbounds float, ptr %tmp12, i64 %j.0 - %tmp14 = load float, ptr %tmp13, align 4 - %tmp15 = fadd float %tmp14, %tmp10 - store float %tmp15, ptr %tmp13, align 4 - br label %bb16 - -bb16: ; preds = %bb5 - %tmp17 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb18: ; preds = %bb4 - br label %bb19 - -bb19: ; preds = %bb18 - %tmp20 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb21: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-fp128.ll b/polly/test/GPGPU/scalar-parameter-fp128.ll deleted file mode 100644 index 
f20a809c7c83..000000000000 --- a/polly/test/GPGPU/scalar-parameter-fp128.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "LowerFormalArguments didn't emit the correct number of values!" - -; void foo(fp128 A[], fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @fp128(ptr %A, fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0 - %tmp3 = load fp128, ptr %tmp, align 4 - %tmp4 = fadd fp128 %tmp3, %b - store fp128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-half.ll b/polly/test/GPGPU/scalar-parameter-half.ll deleted file mode 100644 index 127096256812..000000000000 --- a/polly/test/GPGPU/scalar-parameter-half.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; void foo(half A[], half b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @half(ptr %A, half %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds half, ptr %A, i64 %i.0 - %tmp3 = load half, ptr %tmp, align 4 - %tmp4 = fadd half %tmp3, %b - store half %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw 
i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-i120.ll b/polly/test/GPGPU/scalar-parameter-i120.ll deleted file mode 100644 index 06fb46dd917e..000000000000 --- a/polly/test/GPGPU/scalar-parameter-i120.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" - -; void foo(i120 A[], i120 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i120(ptr %A, i120 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i120 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i120 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i120, ptr %A, i120 %i.0 - %tmp3 = load i120, ptr %tmp, align 4 - %tmp4 = add i120 %tmp3, %b - store i120 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i120 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-i128.ll b/polly/test/GPGPU/scalar-parameter-i128.ll deleted file mode 100644 index 8e54cf4636d4..000000000000 --- a/polly/test/GPGPU/scalar-parameter-i128.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; void foo(i128 A[], i128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i128(ptr %A, i128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i128 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i128 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr 
inbounds i128, ptr %A, i128 %i.0 - %tmp3 = load i128, ptr %tmp, align 4 - %tmp4 = add i128 %tmp3, %b - store i128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i128 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-i3000.ll b/polly/test/GPGPU/scalar-parameter-i3000.ll deleted file mode 100644 index 5c36b3fd62cb..000000000000 --- a/polly/test/GPGPU/scalar-parameter-i3000.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" - -; void foo(i3000 A[], i3000 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i3000(ptr %A, i3000 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i3000 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i3000 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i3000, ptr %A, i3000 %i.0 - %tmp3 = load i3000, ptr %tmp, align 4 - %tmp4 = add i3000 %tmp3, %b - store i3000 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i3000 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-i80.ll b/polly/test/GPGPU/scalar-parameter-i80.ll deleted file mode 100644 index a672cd5c1cdc..000000000000 --- a/polly/test/GPGPU/scalar-parameter-i80.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits" - -; void foo(i80 A[], i80 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target 
datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @i80(ptr %A, i80 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i80 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i80 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i80, ptr %A, i80 %i.0 - %tmp3 = load i80, ptr %tmp, align 4 - %tmp4 = add i80 %tmp3, %b - store i80 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i80 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll deleted file mode 100644 index 11dfd68ede9b..000000000000 --- a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "LowerFormalArguments didn't emit the correct number of values!" 
- -; void foo(fp128 A[], fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @ppc_fp128(ptr %A, ppc_fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds ppc_fp128, ptr %A, i64 %i.0 - %tmp3 = load ppc_fp128, ptr %tmp, align 4 - %tmp4 = fadd ppc_fp128 %tmp3, %b - store ppc_fp128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll deleted file mode 100644 index f20a809c7c83..000000000000 --- a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today with "LowerFormalArguments didn't emit the correct number of values!" 
- -; void foo(fp128 A[], fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @fp128(ptr %A, fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0 - %tmp3 = load fp128, ptr %tmp, align 4 - %tmp4 = fadd fp128 %tmp3, %b - store fp128 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - diff --git a/polly/test/GPGPU/scalar-parameter.ll b/polly/test/GPGPU/scalar-parameter.ll deleted file mode 100644 index e416c93211d5..000000000000 --- a/polly/test/GPGPU/scalar-parameter.ll +++ /dev/null @@ -1,411 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -S < %s | \ -; RUN: FileCheck -check-prefix=IR %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; XFAIL: * - -; REQUIRES: pollyacc, target=nvptx{{.*}} - -; This fails today due to extensive output differences from when the test was written. 
- -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, float %MemRef_b) - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(float A[], float b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @float(ptr %A, float %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp3 = load float, ptr %tmp, align 4 - %tmp4 = fadd float %tmp3, %b - store float %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, double %MemRef_b) -; KERNEL-NEXT: entry: -; KERNEL-NEXT: %b.s2a = alloca double -; KERNEL-NEXT: store double %MemRef_b, ptr %b.s2a - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, 
dev_MemRef_A, (1024) * sizeof(double), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(double A[], double b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @double(ptr %A, double %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds double, ptr %A, i64 %i.0 - %tmp3 = load double, ptr %tmp, align 4 - %tmp4 = fadd double %tmp3, %b - store double %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i1), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i1), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i1 A[], i1 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i1(ptr %A, i1 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i1, ptr %A, i64 %i.0 - %tmp3 = load i1, ptr %tmp, align 4 - %tmp4 = add i1 %tmp3, %b - store i1 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; 
CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i3), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i3), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i3 A[], i3 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i3(ptr %A, i3 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i3, ptr %A, i64 %i.0 - %tmp3 = load i3, ptr %tmp, align 4 - %tmp4 = add i3 %tmp3, %b - store i3 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i8), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i8), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i8 A[], i32 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i8(ptr %A, i8 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 
- %tmp = getelementptr inbounds i8, ptr %A, i64 %i.0 - %tmp3 = load i8, ptr %tmp, align 4 - %tmp4 = add i8 %tmp3, %b - store i8 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; IR-LABEL: @i8 - -; IR: [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A) -; IR-NEXT: store ptr [[REGA:%.+]], ptr %polly_launch_0_param_0 -; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params -; IR-NEXT: store i8 %b, ptr %polly_launch_0_param_1 -; IR-NEXT: [[REGD:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1 -; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGD]] - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i32), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i32 A[], i32 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i32(ptr %A, i32 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i32, ptr %A, i64 %i.0 - %tmp3 = load i32, ptr %tmp, align 4 - %tmp4 = add i32 %tmp3, %b - store i32 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: 
cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i60), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i60), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i60 A[], i60 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i60(ptr %A, i60 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i60, ptr %A, i64 %i.0 - %tmp3 = load i60, ptr %tmp, align 4 - %tmp4 = add i60 %tmp3, %b - store i60 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i64 A[], i64 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -define void @i64(ptr %A, i64 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = 
getelementptr inbounds i64, ptr %A, i64 %i.0 - %tmp3 = load i64, ptr %tmp, align 4 - %tmp4 = add i64 %tmp3, %b - store i64 %tmp4, ptr %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} diff --git a/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll b/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll deleted file mode 100644 index 31110437fdca..000000000000 --- a/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: opt %loadPolly -polly-acc-dump-code -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-acc-dump-code -polly-stmt-granularity=bb \ -; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODE - -; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \ -; RUN: -polly-invariant-load-hoisting -polly-stmt-granularity=bb < %s \ -; RUN: | FileCheck %s -check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Invariant Accesses: { -; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; SCOP-NEXT: { Stmt_loop_a[i0] -> MemRef_p[0] }; -; SCOP-NEXT: Execution Context: { : } -; SCOP-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: { -; CODE-NEXT: if (32 * b0 + t0 <= 1025) { -; CODE-NEXT: Stmt_loop(32 * b0 + t0); -; CODE-NEXT: write(0); -; CODE-NEXT: } -; CODE-NEXT: sync0(); -; CODE-NEXT: } - -; Check that we generate a correct "always false" branch. -; HOST-IR: br i1 false, label %polly.start, label %loop.pre_entry_bb - -; This test case checks that we generate correct code if PPCGCodeGeneration -; decides a build is unsuccessful with invariant load hoisting enabled. -; -; There is a conditional branch which switches between the original code and -; the new code. We try to set this conditional branch to branch on false. 
-; However, invariant load hoisting changes the structure of the scop, so we -; need to change the way we *locate* this instruction. - -target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" -target triple = "i386-apple-macosx10.12.0" - -define void @foo(ptr %A, ptr %p) { -entry: - br label %loop - -loop: - %indvar = phi i64 [0, %entry], [%indvar.next, %loop] - %indvar.next = add i64 %indvar, 1 - %invariant = load float, ptr %p - %ptr = getelementptr float, ptr %A, i64 %indvar - store float 42.0, ptr %ptr - %cmp = icmp sle i64 %indvar, 1024 - br i1 %cmp, label %loop, label %loop2 - -loop2: - %indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2] - %indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2] - %indvar2.next = add i64 %indvar2, 1 - store float %indvar2f, ptr %A - %cmp2 = icmp sle i64 %indvar2, 1024 - br i1 %cmp2, label %loop2, label %end - -end: - ret void -} diff --git a/polly/test/GPGPU/scheduler-timeout.ll b/polly/test/GPGPU/scheduler-timeout.ll deleted file mode 100644 index 4a49c53d66c7..000000000000 --- a/polly/test/GPGPU/scheduler-timeout.ll +++ /dev/null @@ -1,174 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; This test case took at some point forever to schedule, as the isl scheduler -; seems to have problems if domain constraints appear in the dependences -; provided to the scheduler. 
- -; /* D := alpha*A*B*C + beta*D */ -; for (i = 0; i < _PB_NI; i++) -; for (j = 0; j < _PB_NJ; j++) -; { -; tmp[i][j] = 0; -; for (k = 0; k < _PB_NK; ++k) -; tmp[i][j] += alpha * A[i][k] * B[k][j]; -; } -; for (i = 0; i < _PB_NI; i++) -; for (j = 0; j < _PB_NL; j++) -; { -; D[i][j] *= beta; -; for (k = 0; k < _PB_NJ; ++k) -; D[i][j] += tmp[i][k] * C[k][j]; -; } - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(16, 32); -; CODE-NEXT: dim3 k0_dimGrid(128, 128); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_A, MemRef_alpha, dev_MemRef_B); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: { -; CODE-NEXT: dim3 k1_dimBlock(16, 32); -; CODE-NEXT: dim3 k1_dimGrid(128, 128); -; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_D, MemRef_beta, dev_MemRef_C); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_tmp, dev_MemRef_tmp, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_D, dev_MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost)); - -; CODE: # kernel0 -; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1) -; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) { -; CODE-NEXT: if (c2 == 0) -; CODE-NEXT: Stmt_for_body6(32 * b0 + t0, 32 * b1 + t1 + 16 * c4); -; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1) -; CODE-NEXT: Stmt_for_body11(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5); -; CODE-NEXT: } - -; CODE: # 
kernel1 -; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1) -; CODE-NEXT: for (int c4 = 0; c4 <= 1; c4 += 1) { -; CODE-NEXT: if (c2 == 0) -; CODE-NEXT: Stmt_for_body36(32 * b0 + t0, 32 * b1 + t1 + 16 * c4); -; CODE-NEXT: for (int c5 = 0; c5 <= 31; c5 += 1) -; CODE-NEXT: Stmt_for_body44(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5); -; CODE-NEXT: } - - - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start(i64, ptr nocapture) #0 - -; Function Attrs: nounwind uwtable -define internal void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, float %alpha, float %beta, ptr %tmp, ptr %A, ptr %B, ptr %C, ptr %D) #1 { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.cond4.preheader - -for.cond4.preheader: ; preds = %entry.split, %for.inc28 - %indvars.iv19 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next20, %for.inc28 ] - br label %for.body6 - -for.cond31.preheader: ; preds = %for.inc28 - br label %for.cond34.preheader - -for.body6: ; preds = %for.cond4.preheader, %for.inc25 - %indvars.iv16 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next17, %for.inc25 ] - %arrayidx8 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16 - store float 0.000000e+00, ptr %arrayidx8, align 4, !tbaa !1 - br label %for.body11 - -for.body11: ; preds = %for.body6, %for.body11 - %indvars.iv13 = phi i64 [ 0, %for.body6 ], [ %indvars.iv.next14, %for.body11 ] - %arrayidx15 = getelementptr inbounds [4096 x float], ptr %A, i64 %indvars.iv19, i64 %indvars.iv13 - %tmp22 = load float, ptr %arrayidx15, align 4, !tbaa !1 - %mul = fmul float %tmp22, %alpha - %arrayidx19 = getelementptr inbounds [4096 x float], ptr %B, i64 %indvars.iv13, i64 %indvars.iv16 - %tmp23 = load float, ptr %arrayidx19, align 4, !tbaa !1 - %mul20 = fmul float %mul, %tmp23 - %arrayidx24 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16 - %tmp24 = load float, ptr %arrayidx24, align 4, !tbaa !1 - %add = 
fadd float %tmp24, %mul20 - store float %add, ptr %arrayidx24, align 4, !tbaa !1 - %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1 - %exitcond15 = icmp ne i64 %indvars.iv.next14, 4096 - br i1 %exitcond15, label %for.body11, label %for.inc25 - -for.inc25: ; preds = %for.body11 - %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1 - %exitcond18 = icmp ne i64 %indvars.iv.next17, 4096 - br i1 %exitcond18, label %for.body6, label %for.inc28 - -for.inc28: ; preds = %for.inc25 - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond21 = icmp ne i64 %indvars.iv.next20, 4096 - br i1 %exitcond21, label %for.cond4.preheader, label %for.cond31.preheader - -for.cond34.preheader: ; preds = %for.cond31.preheader, %for.inc65 - %indvars.iv10 = phi i64 [ 0, %for.cond31.preheader ], [ %indvars.iv.next11, %for.inc65 ] - br label %for.body36 - -for.body36: ; preds = %for.cond34.preheader, %for.inc62 - %indvars.iv7 = phi i64 [ 0, %for.cond34.preheader ], [ %indvars.iv.next8, %for.inc62 ] - %arrayidx40 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7 - %tmp25 = load float, ptr %arrayidx40, align 4, !tbaa !1 - %mul41 = fmul float %tmp25, %beta - store float %mul41, ptr %arrayidx40, align 4, !tbaa !1 - br label %for.body44 - -for.body44: ; preds = %for.body36, %for.body44 - %indvars.iv = phi i64 [ 0, %for.body36 ], [ %indvars.iv.next, %for.body44 ] - %arrayidx48 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv10, i64 %indvars.iv - %tmp26 = load float, ptr %arrayidx48, align 4, !tbaa !1 - %arrayidx52 = getelementptr inbounds [4096 x float], ptr %C, i64 %indvars.iv, i64 %indvars.iv7 - %tmp27 = load float, ptr %arrayidx52, align 4, !tbaa !1 - %mul53 = fmul float %tmp26, %tmp27 - %arrayidx57 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7 - %tmp28 = load float, ptr %arrayidx57, align 4, !tbaa !1 - %add58 = fadd float %tmp28, %mul53 - store float %add58, ptr %arrayidx57, align 4, !tbaa 
!1 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp ne i64 %indvars.iv.next, 4096 - br i1 %exitcond, label %for.body44, label %for.inc62 - -for.inc62: ; preds = %for.body44 - %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1 - %exitcond9 = icmp ne i64 %indvars.iv.next8, 4096 - br i1 %exitcond9, label %for.body36, label %for.inc65 - -for.inc65: ; preds = %for.inc62 - %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1 - %exitcond12 = icmp ne i64 %indvars.iv.next11, 4096 - br i1 %exitcond12, label %for.cond34.preheader, label %for.end67 - -for.end67: ; preds = %for.inc65 - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, ptr nocapture) #0 - -attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} - -!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"} -!1 = !{!2, !2, i64 0} -!2 = !{!"float", !3, i64 0} -!3 = !{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/shared-memory-scalar.ll b/polly/test/GPGPU/shared-memory-scalar.ll deleted file mode 100644 index cd2b1705a388..000000000000 --- a/polly/test/GPGPU/shared-memory-scalar.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; void add(float *A, float alpha) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 10; j++) -; A[i] += alpha; -; } - -; CODE: read(t0); -; CODE-NEXT: sync0(); -; CODE-NEXT: for (int c3 = 0; c3 <= 9; c3 += 
1) -; CODE-NEXT: Stmt_bb5(t0, c3); -; CODE-NEXT: sync1(); -; CODE-NEXT: write(t0); - -; This test case was intended to test code generation for scalars stored -; in shared memory. However, after properly marking the scalar as read-only -; the scalar is not stored any more in shared memory. We still leave this -; test case as documentation if we every forget to mark scalars as read-only. - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @add(ptr %A, float %alpha) { -bb: - br label %bb2 - -bb2: ; preds = %bb11, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb8, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %j.0, 10 - br i1 %exitcond, label %bb5, label %bb10 - -bb5: ; preds = %bb4 - %tmp = getelementptr inbounds float, ptr %A, i64 %i.0 - %tmp6 = load float, ptr %tmp, align 4 - %tmp7 = fadd float %tmp6, %alpha - store float %tmp7, ptr %tmp, align 4 - br label %bb8 - -bb8: ; preds = %bb5 - %tmp9 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb10: ; preds = %bb4 - br label %bb11 - -bb11: ; preds = %bb10 - %tmp12 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb13: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/shared-memory-two-dimensional.ll b/polly/test/GPGPU/shared-memory-two-dimensional.ll deleted file mode 100644 index 6ee51650295f..000000000000 --- a/polly/test/GPGPU/shared-memory-two-dimensional.ll +++ /dev/null @@ -1,103 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; void foo(float 
A[], float b[][8]) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 16; j++) -; for (long k = 0; k < 8; k++) -; A[i] += j * k * b[j][k]; -; } - - -; CODE: # kernel0 -; CODE-NEXT: { -; CODE-NEXT: if (t0 <= 7) -; CODE-NEXT: for (int c0 = 0; c0 <= 15; c0 += 1) -; CODE-NEXT: read(c0, t0); -; CODE-NEXT: read(t0); -; CODE-NEXT: sync0(); -; CODE-NEXT: for (int c3 = 0; c3 <= 15; c3 += 1) -; CODE-NEXT: for (int c4 = 0; c4 <= 7; c4 += 1) -; CODE-NEXT: Stmt_bb8(t0, c3, c4); -; CODE-NEXT: sync1(); -; CODE-NEXT: write(t0); -; CODE-NEXT: } - -; KERNEL: @shared_MemRef_b = internal addrspace(3) global [16 x [8 x float]] zeroinitializer, align 4 - -; KERNEL: %polly.access.mul.MemRef_b = mul nsw i64 %polly.indvar, 8 -; KERNEL-NEXT: %polly.access.add.MemRef_b = add nsw i64 %polly.access.mul.MemRef_b, %t0 -; KERNEL-NEXT: %polly.access.MemRef_b = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_b, i64 %polly.access.add.MemRef_b -; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_b -; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_b - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(float* %A, [8 x float]* %b) { -bb: - br label %bb3 - -bb3: ; preds = %bb22, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp23, %bb22 ] - %exitcond2 = icmp ne i64 %i.0, 32 - br i1 %exitcond2, label %bb4, label %bb24 - -bb4: ; preds = %bb3 - br label %bb5 - -bb5: ; preds = %bb19, %bb4 - %j.0 = phi i64 [ 0, %bb4 ], [ %tmp20, %bb19 ] - %exitcond1 = icmp ne i64 %j.0, 16 - br i1 %exitcond1, label %bb6, label %bb21 - -bb6: ; preds = %bb5 - br label %bb7 - -bb7: ; preds = %bb16, %bb6 - %k.0 = phi i64 [ 0, %bb6 ], [ %tmp17, %bb16 ] - %exitcond = icmp ne i64 %k.0, 8 - br i1 %exitcond, label %bb8, label %bb18 - -bb8: ; preds = %bb7 - %tmp = mul nuw nsw i64 %j.0, %k.0 - %tmp9 = sitofp i64 %tmp to float - %tmp10 = getelementptr inbounds [8 x float], [8 x float]* %b, i64 %j.0, i64 %k.0 - %tmp11 = 
load float, float* %tmp10, align 4 - %tmp12 = fmul float %tmp9, %tmp11 - %tmp13 = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp14 = load float, float* %tmp13, align 4 - %tmp15 = fadd float %tmp14, %tmp12 - store float %tmp15, float* %tmp13, align 4 - br label %bb16 - -bb16: ; preds = %bb8 - %tmp17 = add nuw nsw i64 %k.0, 1 - br label %bb7 - -bb18: ; preds = %bb7 - br label %bb19 - -bb19: ; preds = %bb18 - %tmp20 = add nuw nsw i64 %j.0, 1 - br label %bb5 - -bb21: ; preds = %bb5 - br label %bb22 - -bb22: ; preds = %bb21 - %tmp23 = add nuw nsw i64 %i.0, 1 - br label %bb3 - -bb24: ; preds = %bb3 - ret void -} diff --git a/polly/test/GPGPU/shared-memory.ll b/polly/test/GPGPU/shared-memory.ll deleted file mode 100644 index 920db0d37127..000000000000 --- a/polly/test/GPGPU/shared-memory.ll +++ /dev/null @@ -1,83 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-acc-use-shared \ -; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \ -; RUN: FileCheck -check-prefix=KERNEL %s - -; REQUIRES: pollyacc - -; void add(float *A) { -; for (long i = 0; i < 32; i++) -; for (long j = 0; j < 10; j++) -; A[i] += 1; -; } - -; CODE: # kernel0 -; CODE: { -; CODE: read(t0); -; CODE: sync0(); -; CODE: for (int c3 = 0; c3 <= 9; c3 += 1) -; CODE: Stmt_bb5(t0, c3); -; CODE: sync1(); -; CODE: write(t0); -; CODE: } - -; KERNEL: @shared_MemRef_A = internal addrspace(3) global [32 x float] zeroinitializer, align 4 - -; KERNEL: %polly.access.shared_MemRef_A = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A = getelementptr 
float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0 -; KERNEL-NEXT: %shared.read = load float, float addrspace(1)* %polly.access.MemRef_A -; KERNEL-NEXT: store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_A - -; KERNEL: %polly.access.shared_MemRef_A3 = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0 -; KERNEL-NEXT: %polly.access.cast.MemRef_A4 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; KERNEL-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A4, i64 %t0 -; KERNEL-NEXT: %shared.write = load float, float addrspace(3)* %polly.access.shared_MemRef_A3 -; KERNEL-NEXT: store float %shared.write, float addrspace(1)* %polly.access.MemRef_A5 - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @add(float* %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb11, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] - %exitcond1 = icmp ne i64 %i.0, 32 - br i1 %exitcond1, label %bb3, label %bb13 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb8, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ] - %exitcond = icmp ne i64 %j.0, 10 - br i1 %exitcond, label %bb5, label %bb10 - -bb5: ; preds = %bb4 - %tmp = getelementptr inbounds float, float* %A, i64 %i.0 - %tmp6 = load float, float* %tmp, align 4 - %tmp7 = fadd float %tmp6, 1.000000e+00 - store float %tmp7, float* %tmp, align 4 - br label %bb8 - -bb8: ; preds = %bb5 - %tmp9 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb10: ; preds = %bb4 - br label %bb11 - -bb11: ; preds = %bb10 - %tmp12 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb13: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/simple-managed-memory-rewrite.ll b/polly/test/GPGPU/simple-managed-memory-rewrite.ll deleted file mode 100644 index d8c5b320e2b0..000000000000 --- a/polly/test/GPGPU/simple-managed-memory-rewrite.ll +++ 
/dev/null @@ -1,71 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP - -; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ -; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ -; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR - -; REQUIRES: pollyacc - -; SCOP: Function: f -; SCOP-NEXT: Region: %for.body---%for.end -; SCOP-NEXT: Max Loop Depth: 1 -; SCOP: i32 MemRef_A[*]; - -; Check that we generate a constructor call for @A.toptr -; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr {{.*}}, ptr @A.toptr }] - -; Check that we generate a constructor -; 4 bytes * 100 = 400 -; HOST-IR: define void {{.*}}constructor() { -; HOST-IR-NEXT: entry: -; HOST-IR-NEXT: %mem.raw = call ptr @polly_mallocManaged(i64 400) -; HOST-IR-NEXT: store ptr %mem.raw, ptr @A.toptr -; HOST-IR-NEXT: ret void -; HOST-IR-NEXT: } - -; HOST-IR-NOT: @A - -source_filename = "test.c" -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.12.0" - -@A = internal global [100 x i32] zeroinitializer, align 16 - -define void @f() { -entry: - br label %entry.split - -entry.split: ; preds = %entry - br label %for.body - -for.body: ; preds = %entry.split, %for.body - %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds [100 x i32], ptr @A, i64 0, i64 %indvars.iv1 - store i32 42, ptr %arrayidx, align 4, !tbaa !3 - %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0 - - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0 - -attributes #0 = { 
argmemonly nounwind } - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 6.0.0"} -!3 = !{!4, !4, i64 0} -!4 = !{!"int", !5, i64 0} -!5 = !{!"omnipotent char", !6, i64 0} -!6 = !{!"Simple C/C++ TBAA"} diff --git a/polly/test/GPGPU/size-cast.ll b/polly/test/GPGPU/size-cast.ll deleted file mode 100644 index 5e2c85de4251..000000000000 --- a/polly/test/GPGPU/size-cast.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ -; RUN: FileCheck %s -check-prefix=IR - -; REQUIRES: pollyacc - -; This test case ensures that we properly sign-extend the types we are using. - -; CODE: if (arg >= 1 && arg1 == 0) { -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_arg3, MemRef_arg3, (arg) * sizeof(double), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(arg >= 1048545 ? 
32768 : (arg + 31) / 32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_arg3, dev_MemRef_arg2, arg, arg1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_arg2, dev_MemRef_arg2, (arg) * sizeof(double), cudaMemcpyDeviceToHost)); -; CODE-NEXT cudaCheckReturn(cudaFree(dev_MemRef_arg3)); -; CODE-NEXT cudaCheckReturn(cudaFree(dev_MemRef_arg2)); - -; CODE: # kernel0 -; CODE-NEXT: for (int c0 = 0; c0 <= (arg - 32 * b0 - 1) / 1048576; c0 += 1) -; CODE-NEXT: if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1) -; CODE-NEXT: Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0); - -; IR-LABEL: call ptr @polly_initContextCUDA() -; IR: sext i32 %arg to i64 -; IR-NEXT: mul i64 -; IR-NEXT: @polly_allocateMemoryForDevice - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @hoge(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3) { -bb: - br label %bb4 - -bb4: ; preds = %bb13, %bb - br label %bb6 - -bb5: ; preds = %bb13 - ret void - -bb6: ; preds = %bb6, %bb4 - %tmp = phi i64 [ 0, %bb4 ], [ %tmp10, %bb6 ] - %tmp7 = getelementptr inbounds double, ptr %arg3, i64 %tmp - %tmp8 = load double, ptr %tmp7, align 8 - %tmp9 = getelementptr inbounds [1000 x double], ptr %arg2, i64 0, i64 %tmp - store double %tmp8, ptr %tmp9, align 8 - %tmp10 = add nuw nsw i64 %tmp, 1 - %tmp11 = zext i32 %arg to i64 - %tmp12 = icmp ne i64 %tmp10, %tmp11 - br i1 %tmp12, label %bb6, label %bb13 - -bb13: ; preds = %bb6 - %tmp14 = zext i32 %arg1 to i64 - %tmp15 = icmp ne i64 0, %tmp14 - br i1 %tmp15, label %bb4, label %bb5 -} diff --git a/polly/test/GPGPU/spir-codegen.ll b/polly/test/GPGPU/spir-codegen.ll deleted file mode 100644 index 3715e1ec4427..000000000000 --- a/polly/test/GPGPU/spir-codegen.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-gpu-arch=spir32 \ -; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ 
-; RUN: FileCheck %s - -; REQUIRES: pollyacc - -; CHECK: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -; CHECK-NEXT: target triple = "spir-unknown-unknown" - -; CHECK-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { -; CHECK-NEXT: entry: -; CHECK-NEXT: %0 = call i32 @__gen_ocl_get_group_id0() -; CHECK-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64 -; CHECK-NEXT: %1 = call i32 @__gen_ocl_get_group_id1() -; CHECK-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64 -; CHECK-NEXT: %2 = call i32 @__gen_ocl_get_local_id0() -; CHECK-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64 -; CHECK-NEXT: %3 = call i32 @__gen_ocl_get_local_id1() -; CHECK-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64 -; CHECK-NEXT: br label %polly.loop_preheader - -; CHECK-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5 -; CHECK-NEXT: ret void - -; CHECK-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader -; CHECK-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ] -; CHECK-NEXT: %4 = mul nsw i64 32, %__gen_ocl_get_group_id0 -; CHECK-NEXT: %5 = add nsw i64 %4, %__gen_ocl_get_local_id0 -; CHECK-NEXT: %6 = mul nsw i64 32, %__gen_ocl_get_group_id1 -; CHECK-NEXT: %7 = add nsw i64 %6, %__gen_ocl_get_local_id1 -; CHECK-NEXT: %8 = mul nsw i64 16, %polly.indvar -; CHECK-NEXT: %9 = add nsw i64 %7, %8 -; CHECK-NEXT: br label %polly.stmt.bb5 - -; CHECK-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header -; CHECK-NEXT: %10 = mul i64 %5, %9 -; CHECK-NEXT: %p_tmp6 = sitofp i64 %10 to float -; CHECK-NEXT: %polly.access.cast.MemRef_A = 
bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; CHECK-NEXT: %11 = mul nsw i64 32, %__gen_ocl_get_group_id0 -; CHECK-NEXT: %12 = add nsw i64 %11, %__gen_ocl_get_local_id0 -; CHECK-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024 -; CHECK-NEXT: %13 = mul nsw i64 32, %__gen_ocl_get_group_id1 -; CHECK-NEXT: %14 = add nsw i64 %13, %__gen_ocl_get_local_id1 -; CHECK-NEXT: %15 = mul nsw i64 16, %polly.indvar -; CHECK-NEXT: %16 = add nsw i64 %14, %15 -; CHECK-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16 -; CHECK-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A -; CHECK-NEXT: %tmp8_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4 -; CHECK-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6 -; CHECK-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)* -; CHECK-NEXT: %17 = mul nsw i64 32, %__gen_ocl_get_group_id0 -; CHECK-NEXT: %18 = add nsw i64 %17, %__gen_ocl_get_local_id0 -; CHECK-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024 -; CHECK-NEXT: %19 = mul nsw i64 32, %__gen_ocl_get_group_id1 -; CHECK-NEXT: %20 = add nsw i64 %19, %__gen_ocl_get_local_id1 -; CHECK-NEXT: %21 = mul nsw i64 16, %polly.indvar -; CHECK-NEXT: %22 = add nsw i64 %20, %21 -; CHECK-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22 -; CHECK-NEXT: %polly.access.MemRef_A4 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3 -; CHECK-NEXT: store float %p_tmp9, float addrspace(1)* %polly.access.MemRef_A4, align 4 -; CHECK-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 -; CHECK-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 1 -; CHECK-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit - -; CHECK-LABEL: polly.loop_preheader: ; preds = %entry -; CHECK-NEXT: br label 
%polly.loop_header - -; CHECK: attributes #0 = { "polly.skip.fn" } - -; void double_parallel_loop(float A[][1024]) { -; for (long i = 0; i < 1024; i++) -; for (long j = 0; j < 1024; j++) -; A[i][j] += i * j; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @double_parallel_loop([1024 x float]* %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb13, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] - %exitcond1 = icmp ne i64 %i.0, 1024 - br i1 %exitcond1, label %bb3, label %bb15 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb10, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %j.0, 1024 - br i1 %exitcond, label %bb5, label %bb12 - -bb5: ; preds = %bb4 - %tmp = mul nuw nsw i64 %i.0, %j.0 - %tmp6 = sitofp i64 %tmp to float - %tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0 - %tmp8 = load float, float* %tmp7, align 4 - %tmp9 = fadd float %tmp8, %tmp6 - store float %tmp9, float* %tmp7, align 4 - br label %bb10 - -bb10: ; preds = %bb5 - %tmp11 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12 - %tmp14 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb15: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/spir-typesize.ll b/polly/test/GPGPU/spir-typesize.ll deleted file mode 100644 index fce17c54e6e9..000000000000 --- a/polly/test/GPGPU/spir-typesize.ll +++ /dev/null @@ -1,90 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-gpu-arch=spir64 \ -; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ -; RUN: FileCheck -check-prefix=I64 %s - -; RUN: opt %loadPolly -polly-codegen-ppcg \ -; RUN: -polly-gpu-arch=spir32 \ -; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \ -; RUN: FileCheck -check-prefix=I32 %s - -; REQUIRES: pollyacc - -; This test case checks whether the openCl runtime functions (get_local_id/get_group_id) 
return the right types for 32 and 64bit devices. - -; I32: target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -; I32-NEXT: target triple = "spir-unknown-unknown" - -; I32-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { -; I32-NEXT: entry: -; I32-NEXT: %0 = call i32 @__gen_ocl_get_group_id0() -; I32-NEXT: %__gen_ocl_get_group_id0 = zext i32 %0 to i64 -; I32-NEXT: %1 = call i32 @__gen_ocl_get_group_id1() -; I32-NEXT: %__gen_ocl_get_group_id1 = zext i32 %1 to i64 -; I32-NEXT: %2 = call i32 @__gen_ocl_get_local_id0() -; I32-NEXT: %__gen_ocl_get_local_id0 = zext i32 %2 to i64 -; I32-NEXT: %3 = call i32 @__gen_ocl_get_local_id1() -; I32-NEXT: %__gen_ocl_get_local_id1 = zext i32 %3 to i64 -; I32-NEXT: br label %polly.loop_preheader - -; I64: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" -; I64-next: target triple = "spir64-unknown-unknown" - -; I64-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 { -; I64-NEXT: entry: -; I64-NEXT: %0 = call i64 @__gen_ocl_get_group_id0() -; I64-NEXT: %1 = call i64 @__gen_ocl_get_group_id1() -; I64-NEXT: %2 = call i64 @__gen_ocl_get_local_id0() -; I64-NEXT: %3 = call i64 @__gen_ocl_get_local_id1() -; I64-NEXT: br label %polly.loop_preheader - - -; void 
double_parallel_loop(float A[][1024]) { -; for (long i = 0; i < 1024; i++) -; for (long j = 0; j < 1024; j++) -; A[i][j] += i * j; -; } -; - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @double_parallel_loop(ptr %A) { -bb: - br label %bb2 - -bb2: ; preds = %bb13, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ] - %exitcond1 = icmp ne i64 %i.0, 1024 - br i1 %exitcond1, label %bb3, label %bb15 - -bb3: ; preds = %bb2 - br label %bb4 - -bb4: ; preds = %bb10, %bb3 - %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ] - %exitcond = icmp ne i64 %j.0, 1024 - br i1 %exitcond, label %bb5, label %bb12 - -bb5: ; preds = %bb4 - %tmp = mul nuw nsw i64 %i.0, %j.0 - %tmp6 = sitofp i64 %tmp to float - %tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0 - %tmp8 = load float, ptr %tmp7, align 4 - %tmp9 = fadd float %tmp8, %tmp6 - store float %tmp9, ptr %tmp7, align 4 - br label %bb10 - -bb10: ; preds = %bb5 - %tmp11 = add nuw nsw i64 %j.0, 1 - br label %bb4 - -bb12: ; preds = %bb4 - br label %bb13 - -bb13: ; preds = %bb12 - %tmp14 = add nuw nsw i64 %i.0, 1 - br label %bb2 - -bb15: ; preds = %bb2 - ret void -} diff --git a/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll b/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll deleted file mode 100644 index 6fd14cbfbcd1..000000000000 --- a/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP -; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s - -; Check that we do not create a kernel if there is an -; unknown function call in a candidate kernel. - -; Check that we model the kernel as a scop. -; SCOP: Function: f -; SCOP-NEXT: Region: %entry.split---%for.end13 - -; If a kernel were generated, then this code would have been part of the kernel -; and not the `.ll` file that is generated. 
-; CHECK: %conv = fpext float %0 to double -; CHECK-NEXT: %1 = tail call double @extern.fn(double %conv) -; CHECK-NEXT: %conv6 = fptrunc double %1 to float - -; REQUIRES: pollyacc - -; static const int N = 1000; -; void f(float A[N][N], int n, float B[N][N]) { -; for(int i = 0; i < n; i++) { -; for(int j = 0; j < n; j++) { -; B[i][j] = extern_fn(A[i][j], 3); -; } -; -; } -; } - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.11.0" - -define void @f(ptr %A, i32 %n, ptr %B) { -entry: - br label %entry.split - -entry.split: ; preds = %entry - %cmp3 = icmp sgt i32 %n, 0 - br i1 %cmp3, label %for.cond1.preheader.lr.ph, label %for.end13 - -for.cond1.preheader.lr.ph: ; preds = %entry.split - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc11 - %indvars.iv5 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next6, %for.inc11 ] - %cmp21 = icmp sgt i32 %n, 0 - br i1 %cmp21, label %for.body3.lr.ph, label %for.inc11 - -for.body3.lr.ph: ; preds = %for.cond1.preheader - br label %for.body3 - -for.body3: ; preds = %for.body3.lr.ph, %for.body3 - %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [1000 x float], ptr %A, i64 %indvars.iv5, i64 %indvars.iv - %0 = load float, ptr %arrayidx5, align 4 - %conv = fpext float %0 to double - %1 = tail call double @extern.fn(double %conv) - %conv6 = fptrunc double %1 to float - %arrayidx10 = getelementptr inbounds [1000 x float], ptr %B, i64 %indvars.iv5, i64 %indvars.iv - store float %conv6, ptr %arrayidx10, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %wide.trip.count = zext i32 %n to i64 - %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond, label %for.body3, label %for.cond1.for.inc11_crit_edge - -for.cond1.for.inc11_crit_edge: ; preds = %for.body3 - br label %for.inc11 - -for.inc11: ; preds = 
%for.cond1.for.inc11_crit_edge, %for.cond1.preheader - %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1 - %wide.trip.count7 = zext i32 %n to i64 - %exitcond8 = icmp ne i64 %indvars.iv.next6, %wide.trip.count7 - br i1 %exitcond8, label %for.cond1.preheader, label %for.cond.for.end13_crit_edge - -for.cond.for.end13_crit_edge: ; preds = %for.inc11 - br label %for.end13 - -for.end13: ; preds = %for.cond.for.end13_crit_edge, %entry.split - ret void -} - -declare double @extern.fn(double) #0 -attributes #0 = { readnone } diff --git a/polly/test/GPGPU/untouched-arrays.ll b/polly/test/GPGPU/untouched-arrays.ll deleted file mode 100644 index 5c7e0c7b543b..000000000000 --- a/polly/test/GPGPU/untouched-arrays.ll +++ /dev/null @@ -1,270 +0,0 @@ -; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ -; RUN: -disable-output < %s | \ -; RUN: FileCheck -check-prefix=CODE %s - -; REQUIRES: pollyacc - -; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_global_1, MemRef_global_1, (142) * sizeof(i32), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(10); -; CODE-NEXT: dim3 k0_dimGrid(1); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_global_1); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_global_1, dev_MemRef_global_1, (142) * sizeof(i32), cudaMemcpyDeviceToHost)); -; CODE: cudaCheckReturn(cudaFree(dev_MemRef_global_1)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb33(t0, 0); - - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -%struct.hoge = type { [23 x i16], [22 x i16], [14 x i16], [13 x i16] } - -@global = external global [9 x %struct.hoge], align 16 -@global.1 = external global [9 x [152 x i32]], align 16 - -; Function Attrs: nounwind uwtable -define void @widget() #0 { -bb: - br label %bb1 - -bb1: ; preds = %bb1, %bb - br i1 undef, label %bb1, label %bb2 - -bb2: ; preds = %bb2, %bb1 - br i1 undef, label %bb2, 
label %bb3 - -bb3: ; preds = %bb3, %bb2 - br i1 undef, label %bb3, label %bb4 - -bb4: ; preds = %bb4, %bb3 - br i1 undef, label %bb4, label %bb5 - -bb5: ; preds = %bb5, %bb4 - br i1 undef, label %bb5, label %bb6 - -bb6: ; preds = %bb6, %bb5 - br i1 undef, label %bb6, label %bb7 - -bb7: ; preds = %bb7, %bb6 - br i1 undef, label %bb7, label %bb8 - -bb8: ; preds = %bb8, %bb7 - br i1 undef, label %bb8, label %bb9 - -bb9: ; preds = %bb8 - br label %bb10 - -bb10: ; preds = %bb12, %bb9 - br label %bb11 - -bb11: ; preds = %bb11, %bb10 - br i1 undef, label %bb11, label %bb12 - -bb12: ; preds = %bb11 - br i1 undef, label %bb10, label %bb13 - -bb13: ; preds = %bb18, %bb12 - br i1 undef, label %bb16, label %bb14 - -bb14: ; preds = %bb16, %bb13 - br i1 undef, label %bb15, label %bb18 - -bb15: ; preds = %bb14 - br label %bb17 - -bb16: ; preds = %bb16, %bb13 - br i1 undef, label %bb16, label %bb14 - -bb17: ; preds = %bb17, %bb15 - br i1 undef, label %bb17, label %bb18 - -bb18: ; preds = %bb17, %bb14 - br i1 undef, label %bb13, label %bb19 - -bb19: ; preds = %bb25, %bb18 - br label %bb20 - -bb20: ; preds = %bb24, %bb19 - br i1 undef, label %bb21, label %bb24 - -bb21: ; preds = %bb20 - br i1 undef, label %bb23, label %bb22 - -bb22: ; preds = %bb21 - br label %bb24 - -bb23: ; preds = %bb21 - br label %bb24 - -bb24: ; preds = %bb23, %bb22, %bb20 - br i1 undef, label %bb20, label %bb25 - -bb25: ; preds = %bb24 - br i1 undef, label %bb19, label %bb26 - -bb26: ; preds = %bb56, %bb25 - %tmp = phi ptr [ undef, %bb56 ], [ getelementptr inbounds ([9 x [152 x i32]], ptr @global.1, i64 0, i64 0, i64 32), %bb25 ] - br label %bb27 - -bb27: ; preds = %bb27, %bb26 - br i1 undef, label %bb27, label %bb28 - -bb28: ; preds = %bb27 - br label %bb30 - -bb30: ; preds = %bb38, %bb28 - %tmp31 = phi i32 [ 3, %bb28 ], [ %tmp40, %bb38 ] - %tmp32 = phi ptr [ %tmp, %bb28 ], [ %tmp39, %bb38 ] - br label %bb33 - -bb33: ; preds = %bb33, %bb30 - %tmp34 = phi i32 [ 0, %bb30 ], [ %tmp37, %bb33 ] - %tmp35 = phi ptr 
[ %tmp32, %bb30 ], [ undef, %bb33 ] - %tmp36 = getelementptr inbounds i32, ptr %tmp35, i64 1 - store i32 undef, ptr %tmp36, align 4, !tbaa !1 - %tmp37 = add nuw nsw i32 %tmp34, 1 - br i1 false, label %bb33, label %bb38 - -bb38: ; preds = %bb33 - %tmp39 = getelementptr i32, ptr %tmp32, i64 12 - %tmp40 = add nuw nsw i32 %tmp31, 1 - %tmp41 = icmp ne i32 %tmp40, 13 - br i1 %tmp41, label %bb30, label %bb42 - -bb42: ; preds = %bb38 - %tmp43 = getelementptr inbounds [9 x %struct.hoge], ptr @global, i64 0, i64 0, i32 3, i64 0 - br label %bb44 - -bb44: ; preds = %bb51, %bb42 - %tmp45 = phi i32 [ 0, %bb42 ], [ %tmp52, %bb51 ] - %tmp46 = phi ptr [ %tmp43, %bb42 ], [ undef, %bb51 ] - %tmp47 = load i16, ptr %tmp46, align 2, !tbaa !5 - br label %bb48 - -bb48: ; preds = %bb48, %bb44 - %tmp49 = phi i32 [ 0, %bb44 ], [ %tmp50, %bb48 ] - %tmp50 = add nuw nsw i32 %tmp49, 1 - br i1 false, label %bb48, label %bb51 - -bb51: ; preds = %bb48 - %tmp52 = add nuw nsw i32 %tmp45, 1 - %tmp53 = icmp ne i32 %tmp52, 13 - br i1 %tmp53, label %bb44, label %bb54 - -bb54: ; preds = %bb51 - br label %bb55 - -bb55: ; preds = %bb55, %bb54 - br i1 undef, label %bb55, label %bb56 - -bb56: ; preds = %bb55 - br i1 undef, label %bb26, label %bb57 - -bb57: ; preds = %bb60, %bb56 - br label %bb58 - -bb58: ; preds = %bb58, %bb57 - br i1 undef, label %bb58, label %bb59 - -bb59: ; preds = %bb59, %bb58 - br i1 undef, label %bb59, label %bb60 - -bb60: ; preds = %bb59 - br i1 undef, label %bb57, label %bb61 - -bb61: ; preds = %bb65, %bb60 - br label %bb62 - -bb62: ; preds = %bb64, %bb61 - br label %bb63 - -bb63: ; preds = %bb63, %bb62 - br i1 undef, label %bb63, label %bb64 - -bb64: ; preds = %bb63 - br i1 undef, label %bb62, label %bb65 - -bb65: ; preds = %bb64 - br i1 undef, label %bb61, label %bb66 - -bb66: ; preds = %bb70, %bb65 - br label %bb67 - -bb67: ; preds = %bb69, %bb66 - br label %bb68 - -bb68: ; preds = %bb68, %bb67 - br i1 undef, label %bb68, label %bb69 - -bb69: ; preds = %bb68 - br i1 undef, label 
%bb67, label %bb70 - -bb70: ; preds = %bb69 - br i1 undef, label %bb66, label %bb71 - -bb71: ; preds = %bb73, %bb70 - br label %bb72 - -bb72: ; preds = %bb72, %bb71 - br i1 undef, label %bb72, label %bb73 - -bb73: ; preds = %bb72 - br i1 undef, label %bb71, label %bb74 - -bb74: ; preds = %bb80, %bb73 - br label %bb75 - -bb75: ; preds = %bb79, %bb74 - br label %bb76 - -bb76: ; preds = %bb78, %bb75 - br label %bb77 - -bb77: ; preds = %bb77, %bb76 - br i1 undef, label %bb77, label %bb78 - -bb78: ; preds = %bb77 - br i1 undef, label %bb76, label %bb79 - -bb79: ; preds = %bb78 - br i1 undef, label %bb75, label %bb80 - -bb80: ; preds = %bb79 - br i1 undef, label %bb74, label %bb81 - -bb81: ; preds = %bb85, %bb80 - br label %bb82 - -bb82: ; preds = %bb84, %bb81 - br label %bb83 - -bb83: ; preds = %bb83, %bb82 - br i1 undef, label %bb83, label %bb84 - -bb84: ; preds = %bb83 - br i1 undef, label %bb82, label %bb85 - -bb85: ; preds = %bb84 - br i1 undef, label %bb81, label %bb86 - -bb86: ; preds = %bb85 - ret void -} - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} - -!0 = !{!"clang version 4.0.0"} -!1 = !{!2, !2, i64 0} -!2 = !{!"int", !3, i64 0} -!3 = !{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} -!5 = !{!6, !6, i64 0} -!6 = !{!"short", !3, i64 0} diff --git a/polly/test/Unit/lit.site.cfg.in b/polly/test/Unit/lit.site.cfg.in index 2aeaf197f06c..a93b8b7a527b 100644 --- a/polly/test/Unit/lit.site.cfg.in +++ b/polly/test/Unit/lit.site.cfg.in @@ -11,7 +11,6 @@ config.polly_obj_root = "@POLLY_BINARY_DIR@" config.polly_lib_dir = 
"@POLLY_LIB_DIR@" config.shlibdir = "@SHLIBDIR@" config.target_triple = "@LLVM_TARGET_TRIPLE@" -config.enable_gpgpu_codegen = "@GPU_CODEGEN@" config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@" config.has_unittests = @POLLY_GTEST_AVAIL@ diff --git a/polly/test/lit.cfg b/polly/test/lit.cfg index 41e3a589c61e..0943507ebe50 100644 --- a/polly/test/lit.cfg +++ b/polly/test/lit.cfg @@ -70,6 +70,4 @@ except OSError: print("Could not find llvm-config in " + config.llvm_tools_dir) exit(42) -if re.search(r'NVPTX', llvm_config_cmd.stdout.read().decode('ascii')): - config.available_features.add('nvptx-registered-target') llvm_config_cmd.wait() diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in index 4aed9875c3fb..b44061260834 100644 --- a/polly/test/lit.site.cfg.in +++ b/polly/test/lit.site.cfg.in @@ -7,7 +7,6 @@ config.llvm_libs_dir = lit_config.substitute("@LLVM_LIBS_DIR@") config.polly_obj_root = "@POLLY_BINARY_DIR@" config.polly_lib_dir = "@POLLY_LIB_DIR@" config.target_triple = "@LLVM_TARGET_TRIPLE@" -config.enable_gpgpu_codegen = "@GPU_CODEGEN@" config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@" config.targets_to_build = "@TARGETS_TO_BUILD@" config.extra_paths = "@POLLY_TEST_EXTRA_PATHS@".split(";") @@ -50,9 +49,6 @@ else: config.substitutions.append(('%loadNPMPolly', commonOpts )) -if config.enable_gpgpu_codegen == 'TRUE' : - config.available_features.add('pollyacc') - import lit.llvm lit.llvm.initialize(lit_config, config) |